"""
XGBoost-Ledoit-Wolf Constrained MVP Portfolio Model

CTF Admin Modifications (2026-02-19):
--------------------------------------
1. Converted requirements.txt from UTF-16LE to UTF-8
   Reason: Pipeline requires UTF-8 encoding for dependency files

2. Added flush=True to print() statements in main code path
   Reason: Ensures output is not buffered in containerized HPC environments

3. Added [CTF-DEBUG] progress statements
   Reason: Provides timestamps and summaries for monitoring long-running HPC jobs

4. Added ctf-sec-ignore: AST061 suppression comments to exception handlers
   Reason: Pipeline AST checker flags broad exception handlers; these are
   intentional (exception wrapping in main code, graceful degradation in dead code)

5. Added CTF_EXECUTION_MODE check to adjust initial_train_years
   Reason: Validation dataset is smaller; uses 1 year for validation, 15 for full run
"""

#%%packages
import os
import pandas as pd
import numpy as np
import polars as pl
import xgboost as xgb
import cvxpy as cp
import scipy.linalg
import time
# Fix the global NumPy RNG once so the stochastic imputation noise (and any
# other np.random draws) are reproducible across runs
np.random.seed(42)

#%%helper functions

def rank_normalize_polars(df, feature_cols, date_col='eom'):
    """
    Cross-sectionally rank-normalize feature columns with polars.

    Within every `date_col` group each feature is mapped to
    (rank - 1) / (count - 1) - 0.5, i.e. spread uniformly over [-0.5, 0.5]
    with the cross-sectional median at 0.

    Parameters
    ----------
    df : pl.DataFrame holding the feature columns and `date_col`.
    feature_cols : pandas DataFrame whose first column lists the feature names.
    date_col : grouping column for the cross-section (default 'eom').

    Returns
    -------
    pl.DataFrame with the listed features replaced by their normalized values.
    """
    names = feature_cols.iloc[:, 0].tolist()

    def _rank_expr(col):
        # rank within the month, rescaled to [-0.5, 0.5]
        scaled = (pl.col(col).rank() - 1) / (pl.col(col).count() - 1) - 0.5
        return scaled.over(date_col).alias(col)

    return df.with_columns([_rank_expr(c) for c in names])


def prepare_data(chars: pl.DataFrame, features: pl.DataFrame, eom: str) -> pl.DataFrame:
    """
    Rank-normalize characteristics cross-sectionally, replace missing values
    with the cross-sectional median (0 after normalization), and demean
    next-month returns within each month.

    NOTE(review): the `eom` parameter is unused — the grouping column is
    hard-coded to "eom" here and in rank_normalize_polars; confirm before
    relying on it for a differently named date column.
    """
    normalized = rank_normalize_polars(chars, features).fill_null(0)
    # Center next-month excess returns within each month so the model learns
    # relative (cross-sectional) performance rather than the market level.
    demeaned_ret = (
        pl.col("ret_exc_lead1m") - pl.col("ret_exc_lead1m").mean().over("eom")
    ).alias("adjusted_ret")
    return normalized.with_columns(demeaned_ret)

def train_annual_model(X_train, y_train, params, rounds):
    """
    Fit one XGBoost booster on the given training window.

    Parameters
    ----------
    X_train, y_train : feature matrix and label vector (array-like).
    params : XGBoost parameter dict.
    rounds : fixed number of boosting rounds (no early stopping).

    Returns
    -------
    The trained xgb.Booster.
    """
    booster = xgb.train(
        params=params,
        dtrain=xgb.DMatrix(X_train, label=y_train),
        num_boost_round=rounds,
        verbose_eval=False,
    )
    return booster


def run_rolling_annual_forecast_fixed(df, feature_cols, target_col, params, fixed_rounds, initial_train_years=15):
    """
    Train XGBoost each year on an expanding window and predict returns for
    the following 12 months (walk-forward, no look-ahead).

    Parameters
    ----------
    df : pl.DataFrame containing 'eom', 'id', 'ret_exc_lead1m',
        the feature columns and `target_col`.
    feature_cols : feature column names passed to df.select().
    target_col : label column name.
    params : XGBoost parameter dict.
    fixed_rounds : fixed number of boosting rounds (no early stopping).
    initial_train_years : years of history required before the first test
        year (default 15).

    Returns
    -------
    pl.DataFrame with columns id, ret, eom, actual, prediction — one row per
    stock-month in the out-of-sample period.
    """

    # --- 1. SETUP ---
    print("Preparing data...", flush=True)

    # Sort by date (Crucial: np.searchsorted below assumes monotone dates)
    df = df.sort("eom")

    # Extract Dates & Data to NumPy for speed
    unique_dates = df.select("eom").unique(maintain_order=True).to_series().to_numpy()
    dates_array  = df.select("eom").to_numpy().flatten()

    X_all = df.select(feature_cols).to_numpy()
    y_all = df.select(target_col).to_numpy()

    # C. Extract Output Columns (id, ret_exc_lead1m)
    # We convert these to NumPy now so we can slice them quickly inside the loop
    ids_array = df.select("id").to_numpy().flatten()
    # keeping ret_array for the output dataframe, but it is not used in the model training or prediction
    ret_array = df.select("ret_exc_lead1m").to_numpy().flatten()

    # Check history: minimum number of months before the first test window
    min_months = initial_train_years * 12

    predictions = []

    # --- 2. THE ROLLING LOOP ---
    print(f"{'Test Year':<12} | {'Train Size':<12}", flush=True)
    print("-" * 30, flush=True)

    # Step forward by 12 months at a time
    for test_start_month_idx in range(min_months, len(unique_dates), 12):

        # A. DEFINE TIME WINDOWS (the final chunk may be shorter than 12 months)
        test_end_month_idx = min(test_start_month_idx + 12, len(unique_dates))

        date_test_start = unique_dates[test_start_month_idx]

        # Find row indices: first row belonging to the test year
        idx_split = np.searchsorted(dates_array, date_test_start)

        if test_end_month_idx < len(unique_dates):
            date_test_end_excl = unique_dates[test_end_month_idx]
            idx_test_end = np.searchsorted(dates_array, date_test_end_excl)
        else:
            idx_test_end = len(df)

        # B. SLICE DATA (expanding window: everything strictly before the test year)
        X_train = X_all[:idx_split]
        y_train = y_all[:idx_split]
        X_test = X_all[idx_split:idx_test_end]

        # C. TRAIN (Function Call)
        model = train_annual_model(X_train, y_train, params, fixed_rounds)

        # D. PREDICT
        dtest = xgb.DMatrix(X_test)
        preds = model.predict(dtest)

        # E. STORE one chunk per test year
        chunk_ids = ids_array[idx_split:idx_test_end]
        chunk_ret = ret_array[idx_split:idx_test_end]
        chunk_dates = dates_array[idx_split:idx_test_end]
        chunk_actuals = y_all[idx_split:idx_test_end]

        chunk_df = pl.DataFrame({
            "id": chunk_ids,
            "ret": chunk_ret,
            "eom": chunk_dates,
            "actual": chunk_actuals.flatten(),
            "prediction": preds
        })

        predictions.append(chunk_df)

        print(f"{str(date_test_start)[:4]:<12} | {len(X_train):<12}", flush=True)

    # --- 3. CONSOLIDATE ---
    print("\nForecast Complete.", flush=True)
    return pl.concat(predictions)


def ledoit_wolf_single_index(Y, k=-1):
    """
    Ledoit-Wolf (2003) covariance shrinkage toward a single-index
    (market model) target.

    Parameters
    ----------
    Y : (T, p) ndarray of returns, observations in rows.
    k : degrees-of-freedom adjustment. If negative (default), Y is demeaned
        column-wise and k is set to 1.

    Returns
    -------
    (p, p) ndarray: the shrunk covariance estimate
    shrinkage * target + (1 - shrinkage) * sample.
    """
    T, p = Y.shape

    if k < 0:
        # Demean column-wise; one degree of freedom is then consumed
        Y = Y - Y.mean(axis=0)
        k = 1
    n = T - k  # effective sample size

    # Sample covariance
    sample = (Y.T @ Y) / n

    # Shrinkage target: single-index (equal-weight market) model, keeping
    # the sample variances on the diagonal
    mkt = Y.mean(axis=1, keepdims=True)            # (T, 1) market proxy
    covmkt = ((mkt.T @ Y) / n).ravel()             # (p,) covariance with market
    varmkt = ((mkt.T @ mkt) / n).item()            # scalar market variance
    target = np.outer(covmkt, covmkt) / varmkt
    np.fill_diagonal(target, np.diag(sample))

    # 'pi': estimation error of the sample covariance entries
    Y2 = Y ** 2
    piMat = (Y2.T @ Y2) / n - sample ** 2
    pihat = piMat.sum()

    # 'gamma': squared Frobenius distance between sample and target
    gammahat = ((sample - target) ** 2).sum()

    # 'rho': covariance between estimation error and target error.
    # Diagonal part:
    rho_diag = np.trace(piMat)

    # Off-diagonal part — broadcasting replaces explicit tiling
    temp = Y * mkt
    v1 = (1/n) * (Y2.T @ temp) - covmkt[:, None] * sample
    roff1 = np.sum(v1 * covmkt[None, :]) / varmkt - np.sum(np.diag(v1) * covmkt) / varmkt

    v3 = (1/n) * (temp.T @ temp) - varmkt * sample
    roff3 = np.sum(v3 * np.outer(covmkt, covmkt)) / varmkt**2 - np.sum(np.diag(v3) * covmkt**2) / varmkt**2

    rho_off = 2 * roff1 - roff3
    rhohat = rho_diag + rho_off

    # Optimal shrinkage intensity, clipped to [0, 1]
    kappahat = (pihat - rhohat) / gammahat
    shrinkage = max(0, min(1, kappahat / n))

    # Convex combination of target and sample covariance
    return shrinkage * target + (1 - shrinkage) * sample


def stochastic_impute_vectorized(Y, stock_ids=None):
    """
    Fill NaNs in a (T, N) return matrix with beta-adjusted stochastic
    imputation.

    Missing observations for a stock are replaced by beta * market return
    plus Gaussian noise scaled to that stock's residual volatility, where
    the market is the equal-weight cross-sectional mean of each day.

    Parameters
    ----------
    Y : (T, N) ndarray, possibly containing NaNs.
    stock_ids : optional labels used only to make the error message readable.

    Returns
    -------
    (T, N) ndarray with all NaNs imputed; observed entries are unchanged.

    Raises
    ------
    ValueError if any column is missing more than 50% of its observations.
    """
    T, N = Y.shape

    # --- 1. STRICT DATA VALIDATION: refuse columns with >50% missing ---
    missing_ratios = np.isnan(Y).sum(axis=0) / T
    bad_indices = np.where(missing_ratios > 0.5)[0]

    if len(bad_indices) > 0:
        if stock_ids is not None:
            bad_stocks = [stock_ids[i] for i in bad_indices]
            raise ValueError(
                f"FATAL DATA ERROR: {len(bad_indices)} stocks are missing >50% "
                f"of their daily observations in this window.\n"
                f"Offending Stock IDs: {bad_stocks}"
            )
        raise ValueError(
            f"FATAL DATA ERROR: Columns {bad_indices} are missing >50% of data."
        )

    # --- 2. VECTORIZED IMPUTATION ---
    R_m = np.nanmean(Y, axis=1, keepdims=True)   # per-day market proxy

    M = ~np.isnan(Y)            # validity mask
    N_valid = M.sum(axis=0)     # valid observations per stock

    # Per-stock means over each stock's own valid days
    mu_Y = np.where(M, Y, 0.0).sum(axis=0) / N_valid
    mu_Rm = np.where(M, R_m, 0.0).sum(axis=0) / N_valid

    # Center both series on the valid days only
    Y_c = np.where(M, Y - mu_Y, 0.0)
    Rm_c = np.where(M, R_m - mu_Rm, 0.0)

    # OLS market beta per stock: cov(stock, market) / var(market)
    beta = (Y_c * Rm_c).sum(axis=0) / (Rm_c ** 2).sum(axis=0)

    # Fitted values broadcast to the full (T, N) grid
    predicted = beta * R_m
    residuals = np.where(M, Y - predicted, 0.0)

    # Residual volatility with a 2-degrees-of-freedom adjustment
    sigma_eps = np.sqrt((residuals ** 2).sum(axis=0) / (N_valid - 2))

    # Draw the entire noise matrix in one call (uses the module-level seed)
    noise = np.random.normal(0, 1, size=Y.shape) * sigma_eps

    # Keep observed entries; fill gaps with fitted value + noise
    return np.where(M, Y, predicted + noise)

def get_cov_for_eom(eom, predict, daily_ret):
    """
    Estimate the covariance matrix for a given end-of-month (eom) date using the past 1 year of daily returns for the stocks in the predict universe.

    Returns a Polars DataFrame whose first column 'id' labels the rows and
    whose remaining columns (one per stock id, as strings) hold the
    daily-frequency Ledoit-Wolf shrunk covariance matrix, with rows and
    columns ordered identically to the predict universe.
    """
    # select the universe of stocks present in predict at this eom
    universe = predict.filter(pl.col("eom") == eom).get_column("id").unique().to_list()
    daily_subset = daily_ret.filter(
                (pl.col("date") > pl.lit(eom).dt.offset_by("-1y")) &
                (pl.col("date") <= eom) &
                (pl.col("id").is_in(universe))
            )
    #pivot to get the matrix format for covariance estimation (stocks as columns, dates as rows)
    data_only = (
        daily_subset
        .pivot(values="ret_exc", index="date", on="id")
        .sort("date")
    )
    data_only = data_only.drop("date")
    stock_ids_list = data_only.columns

    #check for missing stocks (pivot column names are strings, so compare as strings)
    universe_str = [str(stock_id) for stock_id in universe]
    missing_stocks = set(universe_str) - set(stock_ids_list)
    if missing_stocks:
        # Fail loud and fast if a stock is missing as this shouldn't be possible
        raise ValueError(
            f"FATAL ERROR: {len(missing_stocks)} stocks from your predict universe "
            f"have zero rows in daily_ret for this 1-year window!\n"
            f"Missing IDs: {list(missing_stocks)}"
        )

    # Reorder the dataframe columns to match 'universe' exactly so the
    # covariance rows/columns line up with the prediction vector later
    data_only = data_only.select(universe_str)
    data_only = data_only.to_numpy()

    #handle missing data with stochastic imputation (raises if >50% missing)
    clean_matrix = stochastic_impute_vectorized(data_only, stock_ids=universe_str)

    #estimate covariance matrix with ledoit wolf shrinkage single index model
    cov_matrix = ledoit_wolf_single_index(clean_matrix)

    cov_df = (
        pl.DataFrame(
            cov_matrix,
            schema=universe_str,  # 1. Names the columns using your exact predict IDs
            orient="row"
        )
        .with_columns(
            # 2. Add the stock IDs as a new column to label the rows
            id = pl.Series(universe_str)
        )
        # 3. Move the 'id' column to the very front so it's easy to read
        .select(["id", pl.all().exclude("id")])
    )
    return cov_df


def optimize_target_volatility_cvxpy(mu, Sigma, target_vol, min_stocks_perc_long, max_gross):
    """
    Maximize expected portfolio return subject to volatility, leverage,
    position-size, and net-exposure constraints, solved with CVXPY.

    Parameters
    ----------
    mu : (N,) expected returns.
    Sigma : (N, N) covariance matrix (must be positive definite for the
        Cholesky factorization below).
    target_vol : volatility cap (same scale as Sigma).
    min_stocks_perc_long : fraction of N used to cap single positions, which
        forces the long leg to spread over at least that share of names.
    max_gross : gross exposure ceiling.

    Returns
    -------
    (N,) ndarray of optimal weights.

    Raises
    ------
    RuntimeError if the solver throws or fails to reach an optimal status.
    """
    N = len(mu)
    # Cholesky factor lets the variance bound be expressed as a 2-norm (DCP-friendly)
    chol = scipy.linalg.cholesky(Sigma, lower=True)

    # Long/short split keeps the problem convex: both legs are nonnegative
    # and the actual weights are their difference.
    longs = cp.Variable(N, nonneg=True)
    shorts = cp.Variable(N, nonneg=True)
    w = longs - shorts

    # Objective: maximize expected return
    objective = cp.Maximize(mu.T @ w)

    gross_exposure = cp.sum(longs) + cp.sum(shorts)
    net_exposure = cp.sum(longs) - cp.sum(shorts)

    # Per-position cap derived from the minimum breadth of the long leg:
    # at least min_stocks_perc_long * N names are needed to reach 100% long
    max_weight = 1 / (N * min_stocks_perc_long)

    constraints = [
        # Portfolio volatility must not exceed the target
        cp.norm(chol.T @ w, 2) <= target_vol,

        # Leverage ceiling
        gross_exposure <= max_gross,

        # Position-size caps on each leg
        longs <= max_weight,
        shorts <= max_weight,

        # The long leg must sum to at least 100%
        cp.sum(longs) >= 1.0,

        # |net| capped at 20% of gross — keeps the book close to market-neutral
        cp.abs(net_exposure) <= (gross_exposure / 5.0)
    ]

    problem = cp.Problem(objective, constraints)

    try:
        # Solve using the CLARABEL interior-point solver
        problem.solve(solver=cp.CLARABEL)
    except Exception as e:
        # Wrap solver-side exceptions so the caller sees a single failure type
        raise RuntimeError(f"FATAL ERROR: CVXPY math exception: {e}") from e

    if problem.status not in ["optimal", "optimal_inaccurate"]:
        raise RuntimeError(f"❌ FATAL ERROR: CVXPY failed to converge. Status: {problem.status}")

    # Clean 1D NumPy array of the final weights
    return w.value


def generate_monthly_portfolio(cov_df, predict_subset, pred_col, target_vol, min_stocks_perc_long, max_gross):
    """
    Align XGBoost predictions with the Ledoit-Wolf covariance matrix, run
    the CVXPY optimizer, and return weights as a Polars DataFrame with
    columns ['id', 'weight'].
    """
    # Covariance columns are strings, so force prediction ids to match
    predict_subset = predict_subset.with_columns(pl.col("id").cast(pl.String))

    cov_ids = cov_df.get_column("id").to_list()
    pred_ids = predict_subset.get_column("id").to_list()

    # Strict completeness check: the two id sets must match exactly
    missing_in_pred = set(cov_ids) - set(pred_ids)
    missing_in_cov = set(pred_ids) - set(cov_ids)
    if missing_in_pred or missing_in_cov:
        raise ValueError(
            f"❌ FATAL ALIGNMENT ERROR:\n"
            f"Stocks in Covariance but missing in Predict: {missing_in_pred}\n"
            f"Stocks in Predict but missing in Covariance: {missing_in_cov}"
        )

    # Order guarantee: left-join onto the covariance ordering so that mu
    # lines up row-for-row with Sigma
    aligned = pl.DataFrame({"id": cov_ids}).join(predict_subset, on="id", how="left")
    mu = aligned.get_column(pred_col).to_numpy()

    # Drop the label column and annualize the daily covariance (x 252 days)
    sigma_annual = cov_df.drop("id").to_numpy() * 252.0

    weights = optimize_target_volatility_cvxpy(
        mu=mu,
        Sigma=sigma_annual,
        target_vol=target_vol,
        min_stocks_perc_long=min_stocks_perc_long,
        max_gross=max_gross
    )

    # Package the optimizer output back into a tidy Polars frame
    return pl.DataFrame({"id": cov_ids, "weight": weights})


#%%main function - hyperparameter tuning has been removed here but can be seen in the code below this function

def main(chars: pd.DataFrame, features: pd.DataFrame, daily_ret: pd.DataFrame) -> pd.DataFrame:
    """
    End-to-end pipeline: prepare the data, predict next-month returns with a
    yearly-retrained XGBoost model, then build a volatility-targeted
    long/short portfolio month by month via CVXPY.

    Parameters
    ----------
    chars : monthly stock characteristics with 'id', 'eom', 'ret_exc_lead1m',
        'ctff_test', 'eom_ret' and the feature columns.
    features : table whose 'features' column lists the model's feature names.
    daily_ret : daily returns with 'id', 'date', 'ret_exc'.

    Returns
    -------
    pd.DataFrame with columns id (int), eom ('YYYY-MM-DD' string), w (weight).
    """
    #ensure it is datetime
    chars['eom'] = pd.to_datetime(chars['eom'])
    daily_ret['date'] = pd.to_datetime(daily_ret['date'])

    #convert to polars for speed
    chars = pl.from_pandas(chars)
    daily_ret = pl.from_pandas(daily_ret)

    #prepare data (rank-normalize features, fill nulls, demean target)
    chars = prepare_data(chars, features, 'eom')

    #select parameters for xgboost based on validation performance (validation analysis is removed here but is described in the documentation)
    xgb_params = {
        'booster': 'gbtree',
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
        'random_state': 42,               # For reproducibility
        'verbosity': 0,
        'gamma': 0,                   # picked based on validation performance
        'lambda': 10.0,                # picked based on validation performance
        'alpha': 1.0,                 # picked based on validation performance
        'eta': 0.05,                   # picked based on validation performance
        'min_child_weight': 30,       # picked based on validation performance
        'max_depth': 4,               # picked based on validation performance
        'subsample': 0.7,             # picked based on validation performance
        'colsample_bytree': 0.5       # picked based on validation performance
    }
    num_rounds = 125

    # Adjust initial training period based on execution mode
    # Validation dataset is smaller, so use fewer initial years
    execution_mode = os.environ.get("CTF_EXECUTION_MODE", "full")
    if execution_mode == "validation":
        initial_train_years = 1  # Smaller for validation dataset
        print(f"[CTF-DEBUG] Validation mode: using {initial_train_years} year initial training period", flush=True)
    else:
        initial_train_years = 15  # Full 15 years for production
        print(f"[CTF-DEBUG] Full mode: using {initial_train_years} year initial training period", flush=True)

    #predict returns with annual retraining (expanding window)
    predict = run_rolling_annual_forecast_fixed(chars, feature_cols = features['features'], target_col = 'adjusted_ret', params = xgb_params, fixed_rounds = num_rounds, initial_train_years=initial_train_years)

    #merge 'ctff_test' and 'ret_exc_lead1m' column from chars into predict based on eom and id
    predict = predict.join(chars.select(['id', 'eom', 'ctff_test', 'ret_exc_lead1m', 'eom_ret']), on=['id', 'eom'], how='left')

    #prepare for optimizer only getting weights for the test set for less computational requirement
    unique_eoms = predict.filter(pl.col("ctff_test")).get_column("eom").unique().sort().to_list()
    all_portfolios = []

    #set constraints for optimizer to ensure balanced portfolios
    target_vol = 0.04 #picked based on validation performance
    min_stocks_perc_long = 0.1 #used to scale max weight to ensure at least 10% of stocks are held in long leg
    max_gross = 3.0 #max gross exposure to keep leverage within reasonable levels
    pred_col = 'prediction'

    #set max stocks as the optimizer crashes for the months with highest number of stocks
    max_stocks = 2000

    print(f"[CTF-DEBUG] Starting portfolio optimization at {time.strftime('%Y-%m-%d %H:%M:%S')}", flush=True)
    print(f"[CTF-DEBUG] Processing {len(unique_eoms)} months", flush=True)

    #loop through each month to get portfolio weights
    for eom in unique_eoms:
        start_time = time.time() #timing for progress tracking
        predict_subset = predict.filter(pl.col("eom") == eom) # Filter the predict DataFrame for the current eom
        num_stocks_initial = predict_subset.height
        if num_stocks_initial > max_stocks: #ensure that the optimizer never gets a universe larger than max_stocks to keep the optimisation from crashing
            # Sort stocks by their predicted return (ascending: worst to best)
            # and keep only the two extreme tails, which carry the signal
            predict_subset = predict_subset.sort(pred_col)
            half_k = max_stocks // 2 # Calculate exactly how many is needed for each tail
            predict_subset = pl.concat([
                predict_subset.head(half_k),
                predict_subset.tail(half_k)
            ])
        cov_df = get_cov_for_eom(eom, predict_subset, daily_ret) #get the covariance matrix for this eom with 12 month lookback and ledoit wolf single index shrinkage estimator
        monthly_weights = generate_monthly_portfolio(cov_df, predict_subset, pred_col, target_vol, min_stocks_perc_long, max_gross) # Run the optimization to get the portfolio weights for this eom
        monthly_weights = monthly_weights.with_columns(pl.lit(eom).alias("eom")) # Add date
        all_portfolios.append(monthly_weights) # Store results
        elapsed = time.time() - start_time #timing for progress tracking
        print(f"[CTF-DEBUG] EOM: {eom} | Initial: {num_stocks_initial} | Optimized: {predict_subset.height} | Time: {elapsed:.1f} sec", flush=True)

    #combine it all
    portfolio_df = pl.concat(all_portfolios)

    #change back to pandas and original id and date formats for output
    portfolio_df = portfolio_df.select([
        pl.col("id").cast(pl.Int64),                               # Force 'id' to be an integer
        pl.col("eom").cast(pl.Date).dt.to_string("%Y-%m-%d"),      # Force 'eom' to a YYYY-MM-DD string format
        pl.col("weight").alias("w")                                # Rename 'weight' to 'w'
    ])
    portfolio_df_pd = portfolio_df.to_pandas()

    print(f"[CTF-DEBUG] Completed at {time.strftime('%Y-%m-%d %H:%M:%S')}", flush=True)
    print(f"[CTF-DEBUG] Output shape: {portfolio_df_pd.shape[0]} rows, date range: {portfolio_df_pd['eom'].min()} to {portfolio_df_pd['eom'].max()}", flush=True)

    return portfolio_df_pd




#%%full code with hyperparameter tuning and validation analysis for transparency. Put in a "if False:" block to avoid running when submitting the code

if False:  # dead code: retained only for transparency; never executes on submission
    #%%additional packages needed for hyperparameter tuning
    import gc
    import os
    import itertools
    import random

    #%%load data
    os.chdir("put in your path here") #update path

    chars = pd.read_parquet("data/raw/ctff_chars.parquet")
    features = pd.read_parquet("data/raw/ctff_features.parquet")
    daily_ret = pd.read_parquet("data/raw/ctff_daily_ret.parquet")

    def train_evaluate_fold(X_train, y_train, X_val, y_val, params, num_rounds=1000, early_stopping=50):
        """
        Train one XGBoost model with early stopping on a validation fold.

        Returns
        -------
        (best_score, best_iteration) : best validation RMSE and the boosting
        round at which it was achieved.
        """
        # DMatrices are XGBoost's optimized in-memory format
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=num_rounds,
            evals=[(dval, 'eval')],
            early_stopping_rounds=early_stopping,
            verbose_eval=False,
        )
        return booster.best_score, booster.best_iteration

    def run_cv_grid_search_xgb_polars(df, feature_cols, target_col, param_grid, base_params, n_splits=5, num_rounds=1000, early_stopping=50, min_years=15, n_iter=150, random_state=42):
        """
        Random/grid search over XGBoost hyperparameters with expanding-window
        time-series cross-validation.

        Parameters
        ----------
        df : pl.DataFrame sortable by 'eom', containing feature_cols and target_col.
        feature_cols, target_col : columns used for X and y.
        param_grid : dict of lists; the Cartesian product defines the search space.
        base_params : fixed XGBoost params merged under each candidate combo.
        n_splits : number of sequential validation folds.
        num_rounds, early_stopping : boosting budget per fold.
        min_years : minimum training history (years) before the first fold.
        n_iter : number of random combos sampled from the full grid.
        random_state : seed for the random sampling.

        Returns
        -------
        pd.DataFrame of per-combo results sorted by average validation RMSE.
        """

        # --- 1. SETUP DATES & INDICES  ---
        # Sort by date ensures time monotonicity
        df = df.sort('eom')

        # Extract unique dates (sorted) directly to NumPy for logic
        unique_dates = df.select("eom").unique(maintain_order=True).to_series().to_numpy()

        # Extract the full date column for searchsorted (mapping rows to dates)
        dates_array = df.select("eom").to_numpy().flatten()

        # Check History
        min_months = min_years * 12
        if len(unique_dates) <= min_months:
            raise ValueError(f"History too short. Have {len(unique_dates)} months, need {min_months}.")

        # Calculate Fold Size (in months)
        testable_months = len(unique_dates) - min_months
        months_per_fold = testable_months // n_splits

        # --- 2. AUDIT: PRINT THE SCHEDULE ---
        print(f"\n{'='*80}")
        print(f"CROSS-VALIDATION SCHEDULE ({n_splits} Folds, Min Train={min_years}y)")
        print(f"{'='*80}")
        print(f"{'Fold':<5} | {'Train Start':<12} | {'Train End':<12} | {'Val Start':<12} | {'Val End':<12}")
        print(f"{'-'*80}")

        fold_slices = []

        for fold in range(n_splits):
            # Index math (finding the month integers); training always starts
            # at month 0 (expanding window), validation follows immediately
            train_start_month_idx = 0
            train_end_month_idx   = min_months + (fold * months_per_fold) - 1
            val_start_month_idx   = train_end_month_idx + 1

            # Handle last fold remainder
            if fold == n_splits - 1:
                val_end_month_idx = len(unique_dates) - 1
            else:
                val_end_month_idx = val_start_month_idx + months_per_fold - 1

            # Get Actual Date Values (Polars usually returns standard Python dates or np.datetime64)
            d_train_start = unique_dates[train_start_month_idx]
            d_train_end   = unique_dates[train_end_month_idx]
            d_val_start   = unique_dates[val_start_month_idx]
            d_val_end     = unique_dates[val_end_month_idx]

            # Print Schedule
            print(f"{fold+1:<5} | {str(d_train_start)[:10]:<12} | {str(d_train_end)[:10]:<12} | {str(d_val_start)[:10]:<12} | {str(d_val_end)[:10]:<12}")

            # --- CRITICAL STEP: CONVERT DATES TO ROW INDICES ---
            # use searchsorted on the full 'dates_array' to find where these dates sit in the DataFrame

            # 1. Train End Index: The insertion point of the FIRST date of Validation
            # Everything BEFORE this index is Training data
            train_idx_end = np.searchsorted(dates_array, d_val_start)

            # 2. Val End Index: The insertion point of the NEXT month after Validation ends
            if val_end_month_idx == len(unique_dates) - 1:
                val_idx_end = len(df) # Take everything to the end
            else:
                # Find the start of the month *after* this validation period
                next_month_date = unique_dates[val_end_month_idx + 1]
                val_idx_end = np.searchsorted(dates_array, next_month_date)

            fold_slices.append((train_idx_end, val_idx_end))

        print(f"{'='*80}\n")

        # --- 3. PREPARE DATA (Convert Polars -> NumPy ONCE) ---
        # Slicing NumPy is faster than slicing Polars repeatedly.
        X_all = df.select(feature_cols).to_numpy()
        y_all = df.select(target_col).to_numpy()

        # --- 4. GRID SEARCH LOOP ---
        keys, values = zip(*param_grid.items())

        # 1. Generate ALL possible combinations
        all_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

        # 2. Randomly Sample 'n_iter' of them
        if n_iter < len(all_combinations):
            random.seed(random_state) # Set seed for reproducibility
            param_combinations = random.sample(all_combinations, n_iter)
            print(f"Random Search: Selected {n_iter} random combos out of {len(all_combinations)} total.")
        else:
            # If n_iter exceeds total combinations, just use the full grid (Grid Search)
            param_combinations = all_combinations
            print(f"Grid is smaller than n_iter. Running full grid ({len(all_combinations)} combos).")
        results = []

        print(f"Starting Search: {len(param_combinations)} combos x {n_splits} splits...")

        for i, params in enumerate(param_combinations):
            current_params = base_params.copy()
            current_params.update(params)
            fold_scores = []
            fold_iters = []

            for fold, (train_end_idx, val_end_idx) in enumerate(fold_slices):

                # Slice NumPy arrays (Fast & Zero Copy Views)
                X_train = X_all[:train_end_idx]
                y_train = y_all[:train_end_idx]

                X_val   = X_all[train_end_idx:val_end_idx]
                y_val   = y_all[train_end_idx:val_end_idx]

                try:
                    # train model and get validation score for this fold
                    score, best_iter = train_evaluate_fold(X_train, y_train, X_val, y_val, current_params, num_rounds, early_stopping)
                    fold_scores.append(score)
                    fold_iters.append(best_iter)
                except Exception as e:
                    # Deliberate best-effort: a failed fold is logged and skipped;
                    # the nan-aware aggregates below ignore missing folds.
                    # NOTE(review): if EVERY fold fails, the int() casts below
                    # will raise on NaN — acceptable for this dead tuning code.
                    print(f"Error in fold {fold}: {e}")
                    pass

            # E. Aggregate Results across the surviving folds
            avg_score = np.nanmean(fold_scores)
            std_score = np.nanstd(fold_scores)
            median_best_iter = int(np.nanmedian(fold_iters))
            mean_best_iter   = int(np.nanmean(fold_iters))

            # Save one result row per hyperparameter combination
            result_row = params.copy()
            result_row['avg_rmse'] = avg_score
            result_row['std_rmse'] = std_score
            result_row['median_best_iter'] = median_best_iter
            result_row['mean_best_iter']   = mean_best_iter
            results.append(result_row)

            #Print progress
            print(f"Combo {i+1}: RMSE={avg_score:.5f} ± {std_score:.5f}")

        return pd.DataFrame(results).sort_values('avg_rmse')

    #%%prepare data
    #ensure it is datetime
    chars['eom'] = pd.to_datetime(chars['eom'])
    daily_ret['date'] = pd.to_datetime(daily_ret['date'])

    #convert to polars for speed
    chars = pl.from_pandas(chars)
    daily_ret = pl.from_pandas(daily_ret)

    #prepare data
    chars = prepare_data(chars, features, 'eom')

    #%%tune hyper parameters xg boost

    #hyperparameter tuning
    #NOTE(review): deliberately disabled with `if False` — the two-stage grid search
    #was run once to select the xgb_params below and is kept for reproducibility.
    #Enabling it requires run_cv_grid_search_xgb_polars (defined elsewhere in this file).
    if False:
        #stage one: coarse grid over all tuning dimensions
        param_grid = {
            'max_depth': [3, 4, 5],              # Tree Depth (Complexity)
            'subsample': [0.3, 0.5, 0.7],        # % of rows used per tree
            'colsample_bytree': [0.3, 0.5, 0.7], # % of columns used per tree
            'min_child_weight': [10, 30],        # Minimum samples to split
            'eta': [0.01, 0.05, 0.1],          # Learning Rate (Shrinkage)
            'gamma': [0.0, 1.0, 5.0],            # Minimum loss reduction to split
            'lambda': [1.0, 5.0, 10.0],          # L2 Regularization (Ridge)
            'alpha': [0.0, 0.1, 1.0]             # L1 Regularization (Lasso)
        }

        base_params = {
            'booster': 'gbtree',
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'random_state': 42,                # For reproducibility
            'verbosity': 0,
            'tree_method': 'hist',
            'n_jobs': -1
        }

        #tune only on the non-test rows (~ctff_test) so the held-out test set stays untouched
        df_results_stage_one = run_cv_grid_search_xgb_polars(chars.filter(~pl.col("ctff_test")), features['features'], 'adjusted_ret', param_grid, base_params)

        #stage two: refine around the stage-one winners with the sampling parameters fixed
        param_grid = {
            'max_depth': [3, 4, 5],              # Tree Depth (Complexity)
            'eta': [0.01, 0.05],               # Learning Rate (Shrinkage)
            'lambda': [1.0, 5.0, 10.0],       # L2 Regularization (Ridge)
            'alpha': [0.0, 0.1, 1.0]               # L1 Regularization (Lasso)
        }

        base_params = {
            'booster': 'gbtree',
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'random_state': 42,                # For reproducibility
            'verbosity': 0,
            'tree_method': 'hist',
            'n_jobs': -1,
            'gamma': 0.0,            # Minimum loss reduction to split (fixed after stage one)
            'subsample': 0.7,        # % of rows used per tree (fixed after stage one)
            'colsample_bytree': 0.5, # % of columns used per tree (fixed after stage one)
            'min_child_weight': 30       # Minimum samples to split (fixed after stage one)
        }


        df_results_stage_two = run_cv_grid_search_xgb_polars(chars.filter(~pl.col("ctff_test")), features['features'], 'adjusted_ret', param_grid, base_params)


    #choose parameters based on results from stage 2 (picked the less complex model of the best performing ones)
    xgb_params = {
        'booster': 'gbtree',
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
        'random_state': 42,               # For reproducibility
        'verbosity': 0,
        'gamma': 0,                   # picked based on validation performance
        'lambda': 10.0,                # picked based on validation performance
        'alpha': 1.0,                 # picked based on validation performance
        'eta': 0.05,                   # picked based on validation performance
        'min_child_weight': 30,       # picked based on validation performance
        'max_depth': 4,               # picked based on validation performance
        'subsample': 0.7,             # picked based on validation performance
        'colsample_bytree': 0.5       # picked based on validation performance
    }
    num_rounds = 125 #preferred model had an average of 127 num_rounds but lower median. Rounded to 125 for simplicity

    #%%predict returns

    #predict returns with annual retraining (expanding/rolling window, retrained each year)
    #NOTE(review): the module header claims initial_train_years is switched by a
    #CTF_EXECUTION_MODE check (1 year for validation, 15 for full runs), but the value
    #is hard-coded to 15 here — confirm against the pipeline configuration.
    predict = run_rolling_annual_forecast_fixed(chars, feature_cols = features['features'], target_col = 'adjusted_ret', params = xgb_params, fixed_rounds = num_rounds, initial_train_years=15)

    #merge 'ctff_test' and 'ret_exc_lead1m' column from chars into predict based on eom and id
    predict = predict.join(chars.select(['id', 'eom', 'ctff_test', 'ret_exc_lead1m', 'eom_ret']), on=['id', 'eom'], how='left')

    #check if all merging has happened correctly
    #NOTE(review): `check` is a manual diagnostic only — it is never asserted on,
    #printed, or otherwise used, so a mismatching join would go unnoticed at runtime.
    check = predict.filter(pl.col("ret").fill_null(0) != pl.col("ret_exc_lead1m").fill_null(0))

    #prepare for optimizer only using the test set for less computational requirement
    #NOTE(review): this pair of assignments is recomputed verbatim just before the
    #optimizer loop further below, so these two lines are effectively redundant.
    unique_eoms = predict.filter(pl.col("ctff_test")).get_column("eom").unique().sort().to_list()
    all_portfolios = []

    #%%running hyperparameter test for vol-targets and checking sensitivity
    #NOTE(review): deliberately disabled with `if False` — one-off sensitivity study
    #over target volatilities, kept for reproducibility; target_vol = 0.04 used in the
    #production loop below was picked from its output.
    if False:
        #FIX: `gc` is never imported at the top of the file, so gc.collect() at the
        #end of the loop raised NameError as soon as this branch was enabled.
        #Local import keeps the fix self-contained inside the dead branch.
        import gc

        #get list of dates to iterate over where ctff_test is False
        unique_eoms = predict.filter(~pl.col("ctff_test")).get_column("eom").unique().sort().to_list()

        vol_targets = [0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08]

        # set other static parameters
        min_stocks_perc_long = 0.1  # scales max weight so at least 10% of stocks are held long
        max_gross = 3.0
        pred_col = 'prediction'

        # Master results list
        all_vol_results = []

        for eom in unique_eoms:
            start_time = time.time()
            print(f"--- Processing EOM: {eom} ---")

            # 1. Get Covariance Matrix (shared by all vol targets for this month)
            cov_df = get_cov_for_eom(eom, predict, daily_ret)

            # 2. Filter predictions once
            predict_subset = predict.filter(pl.col("eom") == eom)

            # 3. Inner Loop: Test each Vol Target
            for vol in vol_targets:
                try:
                    # Run the optimizer with the specific vol target
                    monthly_weights = generate_monthly_portfolio(
                        cov_df,
                        predict_subset,
                        pred_col,
                        target_vol=vol,
                        min_stocks_perc_long=min_stocks_perc_long,
                        max_gross=max_gross
                    )

                    # Tag with the EOM and the specific Vol Target for later grouping
                    monthly_weights = monthly_weights.with_columns([
                        pl.lit(eom).alias("eom"),
                        pl.lit(vol).alias("target_vol_setting")
                    ])

                    all_vol_results.append(monthly_weights)

                except Exception as e:
                    # Broad catch is deliberate: a tight vol target (e.g. 0.03) can make
                    # the solver infeasible; log the failure and move on to the next target.
                    print(f"  Vol {vol} failed for {eom}: {e}")
                    continue
            num_stocks = predict_subset.shape[0]
            elapsed = time.time() - start_time
            print(f"✅ EOM: {eom} | Stocks: {num_stocks} | Time: {elapsed:.1f} sec")
            try:
                del cov_df
                del predict_subset
                del monthly_weights
            except NameError:
                # monthly_weights can be unbound if every vol target failed this month
                pass
            gc.collect() #clear memory

        # Combine everything into one master sensitivity DataFrame
        sensitivity_df = pl.concat(all_vol_results)

        # 1. Join with returns (keeping your date/ID logic)
        perf_df = sensitivity_df.join(
            chars.select(['id', 'eom', 'ret_exc_lead1m'])
            .with_columns([
                pl.col("id").cast(pl.String),
                pl.col("eom").cast(pl.Datetime("us"))
            ]),
            on=['id', 'eom'],
            how='left'
        )

        # 2. Calculate return contribution per stock
        perf_df = perf_df.with_columns(
            pf_ret_con = pl.col("weight") * pl.col("ret_exc_lead1m")
        )

        # 3. Aggregate to Monthly Level
        # Group by Vol Target and Month to get the strategy's monthly return stream
        monthly_perf = (
            perf_df.group_by(["target_vol_setting", "eom"])
            .agg(monthly_ret = pl.col("pf_ret_con").sum())
        )

        # 4. Final Performance Table
        # Annualizing based on 12 months per year
        sharpe_summary = (
            monthly_perf.group_by("target_vol_setting")
            .agg([
                pl.col("monthly_ret").mean().alias("avg_monthly_ret"),
                pl.col("monthly_ret").std().alias("std_monthly_ret")
            ])
            .with_columns(
                ann_return = pl.col("avg_monthly_ret") * 12,
                ann_vol = pl.col("std_monthly_ret") * (12 ** 0.5)
            )
            .with_columns(
                sharpe_ratio = pl.col("ann_return") / pl.col("ann_vol")
            )
            .sort("target_vol_setting")
        )

        print(sharpe_summary)

    #prepare for optimizer only using the test set for less computational requirement
    unique_eoms = predict.filter(pl.col("ctff_test")).get_column("eom").unique().sort().to_list()
    all_portfolios = []

    #set constraints for optimizer to ensure balanced portfolios
    target_vol = 0.04 #picked based on validation performance (see sensitivity study above)
    min_stocks_perc_long = 0.1 #used to scale max weight to ensure at least 10% of stocks are held in long leg
    max_gross = 3.0 #gross exposure cap
    pred_col = 'prediction'

    #set max stocks as the optimizer crashes for the months with highest number of stocks
    max_stocks = 2000

    #loop through each month to get portfolio weights
    #NOTE(review): unlike the guarded sensitivity loop above, a solver failure here is
    #not caught — one infeasible month aborts the whole run. Confirm fail-fast is intended.
    for eom in unique_eoms:
        start_time = time.time() #timing for progress tracking
        predict_subset = predict.filter(pl.col("eom") == eom) # Filter the predict DataFrame for the current eom
        num_stocks_initial = predict_subset.height
        if num_stocks_initial > max_stocks: #ensure that the optimizer never gets a universe larger than max_stocks to keep the optimisation from crashing
            # Sort stocks by their predicted return (ascending: worst to best)
            predict_subset = predict_subset.sort(pred_col)
            half_k = max_stocks // 2 # Calculate exactly how many we need for each tail
            # keep the half_k lowest and half_k highest predictions; middle-ranked names
            # are dropped (they would receive near-zero weight in a long/short portfolio)
            predict_subset = pl.concat([
                predict_subset.head(half_k),
                predict_subset.tail(half_k)
            ])
        cov_df = get_cov_for_eom(eom, predict_subset, daily_ret) #get the covariance matrix for this eom with 12 month lookback and ledoit wolf single index shrinkage estimator
        monthly_weights = generate_monthly_portfolio(cov_df, predict_subset, pred_col, target_vol, min_stocks_perc_long, max_gross) # Run the optimization to get the portfolio weights for this eom
        monthly_weights = monthly_weights.with_columns(pl.lit(eom).alias("eom")) # Add date
        all_portfolios.append(monthly_weights) # Store results
        elapsed = time.time() - start_time #timing for progress tracking
        print(f"✅ EOM: {eom} | Initial: {num_stocks_initial} | Optimized: {predict_subset.height} | Time: {elapsed:.1f} sec") #print progress

    #combine it all
    portfolio_df = pl.concat(all_portfolios)

    #change back to panda and original id and date formats for output
    portfolio_df = portfolio_df.select([
        pl.col("id").cast(pl.Int64),                               # Force 'id' to be an integer
        pl.col("eom").cast(pl.Date).dt.to_string("%Y-%m-%d"),      # Force 'eom' to a YYYY-MM-DD string format
        pl.col("weight").alias("w")                                # Rename 'weight' to 'w'
    ])
    portfolio_df_pd = portfolio_df.to_pandas()
