"""
CTF Submission: Elastic Net Feature Selection + XGBoost Return Prediction
Rolling window (120 months), refit every 12 months

CTF Admin Modifications (2026-02-17):
--------------------------------------
1. Created requirements.txt with xgboost>=2.0.0
   Reason: xgboost is not pre-installed in the HPC environment.

2. Added flush=True to all print statements
   Reason: Ensure output is visible in containerized HPC environment.

3. Added [CTF-DEBUG] prefix to progress statements
   Reason: Enable HPC job monitoring with consistent log parsing.

4. Added test period filtering (ctff_test == True) to main() output
   Reason: Original code only filtered in __main__ block which doesn't run
           in pipeline. Without filtering, output exceeds 150MB limit.
"""

import time

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


def _prepare_features(df: pd.DataFrame, feat_cols: list) -> pd.DataFrame:
    """Impute then ECDF-transform all feature columns cross-sectionally."""
    df = df.copy()
    present = [c for c in feat_cols if c in df.columns]
    
    df[present] = df.groupby('eom')[present].transform(
        lambda x: x.fillna(x.median())
    )
    
    df[present] = df.groupby('eom')[present].transform(
        lambda x: x.rank(method='average', pct=True)
    )
    
    return df


def _select_features(train_df: pd.DataFrame, feat_cols: list, top_n: int = 50) -> list:
    """
    Select predictive features with a cross-validated elastic net.

    Fits ElasticNetCV on standardized training features and returns up to
    top_n feature names ranked by absolute coefficient. If the net shrinks
    every coefficient to zero, falls back to the top_n highest-variance
    features so downstream training always has inputs.

    Args:
        train_df: Training panel containing feature columns and the target.
        feat_cols: Candidate feature names; names absent from train_df are skipped.
        top_n: Maximum number of features to return.

    Returns:
        List of selected feature names.
    """
    target = 'ret_exc_lead1m'
    usable = [c for c in feat_cols if c in train_df.columns]
    frame = train_df[usable + [target]].dropna(subset=[target])

    # Drop all-NaN columns, then neutral-fill remaining gaps at the ECDF midpoint.
    design = frame[usable].dropna(axis=1, how='all').fillna(0.5)
    kept_cols = design.columns.tolist()
    labels = frame[target].values

    standardized = StandardScaler().fit_transform(design.values)

    net = ElasticNetCV(
        l1_ratio=[0.5, 0.9, 1.0],
        cv=3,
        max_iter=1000,
        n_jobs=-1,
        random_state=42,
    )
    net.fit(standardized, labels)

    magnitude = pd.Series(np.abs(net.coef_), index=kept_cols)
    chosen = magnitude[magnitude > 0].nlargest(top_n).index.tolist()

    if not chosen:
        # Degenerate fit: every coefficient was shrunk to zero.
        print(f'[CTF-DEBUG] EN selected 0 features — falling back to top {top_n} by variance', flush=True)
        fallback = pd.Series(design.var().values, index=kept_cols)
        chosen = fallback.nlargest(top_n).index.tolist()
    else:
        print(f'[CTF-DEBUG] EN selected {len(chosen)} features '
              f'(l1_ratio={net.l1_ratio_:.2f}, alpha={net.alpha_:.5f})', flush=True)

    return chosen


def _fit_xgb(train_df: pd.DataFrame, sel_feats: list) -> xgb.Booster:
    """Train a gradient-boosted tree regressor on the selected features.

    Rows with a missing target are dropped. Hyperparameters are fixed:
    shallow trees with heavy row/column subsampling for regularization,
    200 boosting rounds at a 0.05 learning rate, fixed seed for
    reproducibility.

    Args:
        train_df: Training panel containing sel_feats and the target column.
        sel_feats: Feature column names to train on.

    Returns:
        A trained xgb.Booster.
    """
    target = 'ret_exc_lead1m'
    usable = train_df[sel_feats + [target]].dropna(subset=[target])

    booster_params = {
        'booster':          'gbtree',
        'eta':              0.05,
        'max_depth':        4,
        'subsample':        0.5,
        'colsample_bytree': 0.5,
        'min_child_weight': 20,
        'objective':        'reg:squarederror',
        'verbosity':        0,
        'seed':             42,
    }

    train_matrix = xgb.DMatrix(usable[sel_feats], label=usable[target])
    return xgb.train(booster_params, train_matrix, num_boost_round=200)


def _rank_weights(pred_series: pd.Series) -> pd.Series:
    """
    Rank-based long-short weights:
      w_i = (rank_i - mean_rank) / sum(|rank_i - mean_rank|) * 2
    Sums to 0 (dollar-neutral), scales so sum of |w| = 2
    """
    ranks = pred_series.rank(ascending=True, method='average')
    demeaned = ranks - ranks.mean()
    total_abs = demeaned.abs().sum()
    if total_abs == 0:
        return pd.Series(0.0, index=pred_series.index)
    return demeaned / total_abs * 2



def main(chars: pd.DataFrame,
         features: pd.DataFrame,
         daily_ret: pd.DataFrame) -> pd.DataFrame:
    """
    Elastic Net feature selection + XGBoost return prediction.
    Rolling 120-month window; refit every 12 months.

    Args:
        chars: Stock characteristics (ctff_chars.parquet). Must contain
            'id', 'eom', 'ret_exc_lead1m', 'ctff_test' plus feature columns.
        features: Computed features (ctff_features.parquet); its 'features'
            column lists candidate feature names.
        daily_ret: Historical daily returns (ctff_daily_ret.parquet).
            Currently unused; kept for pipeline interface compatibility.

    Returns:
        DataFrame with columns: id, eom, w — restricted to test-period months.
    """
    np.random.seed(42)

    TARGET         = 'ret_exc_lead1m'   # target name (helpers hard-code the same)
    REFIT_FREQ     = 12    # months between feature/model refits
    MIN_MONTHS     = 36    # minimum training months before producing weights
    TOP_N          = 50    # features kept after elastic-net ranking
    ROLLING_WINDOW = 120   # training window length in months

    feat_cols = features['features'].tolist()

    # Preprocess features
    # CTF-FIX: Add start timestamp for HPC monitoring
    print(f"[CTF-DEBUG] Starting main() at {time.strftime('%Y-%m-%d %H:%M:%S')}", flush=True)
    start_time = time.time()

    print('[CTF-DEBUG] Preprocessing features...', flush=True)
    chars = _prepare_features(
        chars.sort_values('eom').reset_index(drop=True),
        feat_cols
    )

    all_dates = chars['eom'].unique()
    print(f'[CTF-DEBUG] Total months: {len(all_dates)}', flush=True)

    all_results = []
    current_feats = None
    current_model = None
    last_refit_idx = -9999  # sentinel: forces a refit on the first eligible month

    for t_idx, t_date in enumerate(all_dates):
        # Rolling window: up to ROLLING_WINDOW months strictly before t_date.
        start_idx = max(0, t_idx - ROLLING_WINDOW)
        start_date = all_dates[start_idx]
        train_mask = (chars['eom'] >= start_date) & (chars['eom'] < t_date)
        test_mask  = chars['eom'] == t_date

        n_train_months = chars.loc[train_mask, 'eom'].nunique()
        if n_train_months < MIN_MONTHS:
            continue

        train_df = chars[train_mask]
        test_df  = chars[test_mask].copy()

        # Feature selection: refit every REFIT_FREQ months (and on first pass).
        if current_feats is None or (t_idx - last_refit_idx) >= REFIT_FREQ:
            print(f'[CTF-DEBUG] [{t_date}] Refitting EN+XGB on {n_train_months} months '
                  f'(window: {start_date} to {t_date})...', flush=True)
            current_feats = _select_features(train_df, feat_cols, TOP_N)
            last_refit_idx = t_idx

        if len(current_feats) == 0:
            print(f'[CTF-DEBUG] [{t_date}] No features selected; skipping.', flush=True)
            continue

        # XGBoost training: retrain only in the month a refit just happened.
        if current_model is None or (t_idx - last_refit_idx) == 0:
            current_model = _fit_xgb(train_df, current_feats)

        # Predict the test month and convert predictions to rank weights.
        feat_ok = [f for f in current_feats if f in test_df.columns]
        dtest = xgb.DMatrix(test_df[feat_ok].fillna(0.5))
        test_df['pred'] = current_model.predict(dtest)
        test_df['w']    = _rank_weights(test_df['pred'])

        all_results.append(test_df[['id', 'eom', 'w']])

    # CTF-FIX (robustness): pd.concat([]) raises ValueError. If no month had
    # enough training history, return an empty frame with the expected schema.
    if not all_results:
        print('[CTF-DEBUG] No months produced predictions; returning empty output.', flush=True)
        return pd.DataFrame(columns=['id', 'eom', 'w'])

    output = pd.concat(all_results, ignore_index=True)

    # CTF-FIX: Filter to test period only (ctff_test == True)
    # Original code only filtered in __main__ block which doesn't run in pipeline
    test_dates = chars.loc[chars['ctff_test'] == True, 'eom'].unique()
    output = output[output['eom'].isin(test_dates)]

    # CTF-FIX: Add completion summary for HPC monitoring
    elapsed = time.time() - start_time
    print(f"[CTF-DEBUG] Completed main() in {elapsed:.1f}s", flush=True)
    print(f"[CTF-DEBUG] Output: {len(output)} rows, {output['eom'].nunique()} unique months", flush=True)

    return output[['id', 'eom', 'w']]

if __name__ == '__main__':

    DATA_DIR = '/Users/duruunsal/Documents/stanford 2025-2026/winter 2026/datasci 194c/hw4/194c_assignment 4/'

    print('Loading data...')
    chars = pd.read_parquet(f'{DATA_DIR}/ctff_chars.parquet')
    features = pd.read_csv(f'{DATA_DIR}/ctff_features.csv')
    daily_ret = pd.read_parquet(f'{DATA_DIR}/ctff_daily_ret.parquet')

    print('Running main()...')
    results_df = main(chars, features, daily_ret)

    # Keep only test-period rows and order deterministically for submission.
    flags = chars[['id', 'eom', 'ctff_test']].drop_duplicates()
    merged = results_df.merge(flags, on=['id', 'eom'], how='left')
    in_test = merged['ctff_test'] == True
    submission_ctf = (
        merged.loc[in_test, ['id', 'eom', 'w']]
        .sort_values(['eom', 'id'])
        .reset_index(drop=True)
    )

    submission_ctf.to_csv('ctf_submission_weights.csv', index=False)
    print(f'Saved {len(submission_ctf)} rows to ctf_submission_weights.csv')

    # Quick diagnostic: monthly portfolio return, then annualized Sharpe.
    monthly_rets = (
        results_df.merge(chars[['id', 'eom', 'ret_exc_lead1m']], on=['id', 'eom'])
        .groupby('eom')
        .apply(lambda g: (g['w'] * g['ret_exc_lead1m']).sum())
    )
    sharpe = np.sqrt(12) * monthly_rets.mean() / monthly_rets.std()
    print(f'Estimated Sharpe: {sharpe:.3f}')