"""CTF-compatible submission: Ridge + XGBoost ensemble for cross-sectional return prediction.

This module implements a hybrid machine learning approach combining Ridge regression
and XGBoost for predicting cross-sectional stock returns. The model uses cross-sectional
z-score normalization and produces dollar-neutral portfolio weights.

Original submission by: Thanh-Tuan Le (FINN43815 Group 16)
Modified for CTF pipeline compatibility.

Changes made from original submission (FINN43815_Group16.py):
=============================================================
1. WRAPPED IN main() FUNCTION: Original code executed at module level, reading
   data files directly. Now wrapped in main(chars, features, daily_ret) function
   that receives data as parameters per CTF pipeline requirements.

2. REMOVED MODULE-LEVEL EXECUTION: Original file ran training, backtesting, and
   plotting immediately on import. All execution now happens only when main() is called.

3. REMOVED BACKTESTING CODE: Original file included extensive backtesting,
   hyperparameter tuning, diagnostics, and visualization code (Sections 8-17).
   Only the core model training and prediction logic is retained.

4. REMOVED UNUSED IMPORTS: Removed matplotlib, statsmodels, collections, pathlib
   which were only used for backtesting/visualization.

5. OUTPUT FORMAT: Added conversion of model predictions to portfolio weights
   in the required format: DataFrame with columns ['id', 'eom', 'w'].
   Uses dollar-neutral weighting (long high predictions, short low predictions).

6. SIMPLIFIED MODEL: Uses the V3 baseline model (Ridge + XGBoost ensemble)
   without the V4 feature neutralization, as this provides a good balance
   of performance and simplicity. The ensemble blends 50% Ridge + 50% XGBoost.

Core methodology retained from original:
- Cross-sectional z-score normalization (xsec_z function)
- Winsorization to ±3 standard deviations
- Training-set imputation parameters (1st/99th percentile bounds, median fill)
- Ridge regression with alpha=100 and winsorized targets
- XGBoost with 150 trees, max_depth=5, learning_rate=0.05

Main Function:
    main: Entry point that receives CTF data, trains the models, and returns portfolio weights.
"""

import gc
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
import xgboost as xgb


def xsec_z(g: pd.DataFrame) -> pd.DataFrame:
    """Cross-sectional z-score by month.

    Standardizes features within each cross-section (month) to have
    zero mean and unit standard deviation.
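
    Example (illustrative):
        >>> xsec_z(pd.DataFrame({'f': [0.0, 2.0, 4.0]}))['f'].round(6).tolist()
        [-1.224745, 0.0, 1.224745]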
    """
    m = g.mean()
    s = g.std(ddof=0)
    s = s.replace(0, 1.0)  # guard against zero-variance (constant) columns
    return (g - m) / (s + 1e-12)  # small epsilon as an extra division safeguard


def apply_winsor_impute(X_raw: pd.DataFrame,
                        L: pd.Series,
                        U: pd.Series,
                        med: pd.Series) -> pd.DataFrame:
    """Apply winsorization and imputation using training set parameters.

    Args:
        X_raw: Raw feature DataFrame
        L: Lower bounds (1st percentile from training)
        U: Upper bounds (99th percentile from training)
        med: Median values for imputation (from training)

    Returns:
        Preprocessed feature DataFrame
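
    Example (illustrative, single feature column):
        >>> X = pd.DataFrame({'f': [5.0, 0.5, np.nan]})
        >>> L, U = pd.Series({'f': -1.0}), pd.Series({'f': 1.0})
        >>> med = pd.Series({'f': 0.5})
        >>> apply_winsor_impute(X, L, U, med)['f'].tolist()
        [1.0, 0.5, 0.5]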
    """
    X = X_raw.clip(lower=L, upper=U, axis=1)  # per-column winsorization at training bounds
    X = X.fillna(med)  # impute remaining NaNs with training medians
    X = X.clip(-3.0, 3.0)  # final clip in z-score units
    return X.astype(np.float32)


def main(chars: pd.DataFrame, features: pd.DataFrame, daily_ret: pd.DataFrame) -> pd.DataFrame:
    """Main function to train model and calculate portfolio weights.

    Implements a Ridge + XGBoost ensemble model with cross-sectional
    z-score normalization and dollar-neutral portfolio construction.

    Args:
        chars (pd.DataFrame): DataFrame containing characteristics data.
        features (pd.DataFrame): DataFrame with 'features' column containing feature names.
        daily_ret (pd.DataFrame): DataFrame containing daily returns data (not used).

    Returns:
        pd.DataFrame: Test-period portfolio weights with columns 'id', 'eom',
            and 'w'. Weights are dollar-neutral within each month (they sum to
            ~0 with gross exposure of 2).
    """
    # Extract feature list from features DataFrame
    feature_list = features['features'].tolist()

    # Convert date column
    chars = chars.copy()
    chars['eom'] = pd.to_datetime(chars['eom'])

    # Keep only features that exist in chars
    feature_list = [c for c in feature_list if c in chars.columns]

    # Drop rows with a missing target; note this also removes any test rows
    # lacking 'ret_exc_lead1m'
    df = chars.dropna(subset=['ret_exc_lead1m']).copy()

    # Replace inf with NaN
    df[feature_list] = df[feature_list].replace([np.inf, -np.inf], np.nan)

    # Cross-sectional z-score by month
    df[feature_list] = df.groupby('eom', group_keys=False)[feature_list].apply(xsec_z)

    # Winsorize to reduce impact of extreme outliers
    df[feature_list] = df[feature_list].clip(lower=-3.0, upper=3.0)

    # Fill remaining NaNs with 0
    df[feature_list] = df[feature_list].replace([np.inf, -np.inf], np.nan).fillna(0.0)
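
    # At this point every feature value is finite and lies within [-3, 3], so
    # the later median imputation in apply_winsor_impute acts only as a
    # defensive safety net.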

    # Split into train and test based on ctff_test flag
    train_df = df[df['ctff_test'] == False].copy()
    test_df = df[df['ctff_test'] == True].copy()

    if len(train_df) == 0 or len(test_df) == 0:
        # Fallback: equal weights if no valid data
        test_only = chars[chars['ctff_test'] == True][['id', 'eom']].copy()
        test_only['w'] = 1.0 / test_only.groupby('eom')['id'].transform('count')
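        # e.g., a month with 50 test names receives w = 1/50 = 0.02 per stock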
        return test_only[['id', 'eom', 'w']]

    # Get training targets
    y_train = train_df['ret_exc_lead1m'].astype(np.float32)

    # Winsorize y for Ridge robustness
    y_limit = y_train.std() * 5.0
    y_train_ridge = y_train.clip(-y_limit, y_limit).astype(np.float32)
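    # e.g., with a monthly target std of 0.10, returns are clipped to +/-0.50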

    # Get training set parameters for preprocessing
    X_train_raw = train_df[feature_list].copy()
    L_BOUND = X_train_raw.quantile(0.01)
    U_BOUND = X_train_raw.quantile(0.99)
    IMPUTE_VALS = X_train_raw.median()
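    # Bounds and medians come from the training set only, so the test set is
    # preprocessed without look-ahead information.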

    # Apply preprocessing to train and test
    X_train = apply_winsor_impute(X_train_raw, L_BOUND, U_BOUND, IMPUTE_VALS)
    X_test = apply_winsor_impute(test_df[feature_list].copy(), L_BOUND, U_BOUND, IMPUTE_VALS)

    # Final safety check: fill any NaNs that survived preprocessing
    X_train = X_train.fillna(0.0)
    X_test = X_test.fillna(0.0)

    # Train Ridge model (note: random_state has no effect with Ridge's default
    # solver; it is kept for parity with the original submission)
    ridge_model = Ridge(alpha=100.0, random_state=42)
    ridge_model.fit(X_train, y_train_ridge)

    # Train XGBoost model
    xgb_model = xgb.XGBRegressor(
        n_estimators=150,
        max_depth=5,
        learning_rate=0.05,
        reg_lambda=1.0,
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
        verbosity=0
    )
    xgb_model.fit(X_train, y_train)

    # Ensemble predictions (50/50 blend); test_df is already an independent
    # copy from the train/test split above
    pred_ridge = ridge_model.predict(X_test)
    pred_xgb = xgb_model.predict(X_test)
    test_df['pred'] = 0.5 * pred_ridge + 0.5 * pred_xgb

    # Convert predictions to portfolio weights
    # Cross-sectional rank to [0,1]
    test_df['rank'] = test_df.groupby('eom')['pred'].rank(pct=True)

    # Dollar-neutral weights: demean ranks within each month, then scale so
    # gross exposure (sum of |w|) equals 2, i.e. +1 long and -1 short
    test_df['w'] = test_df.groupby('eom')['rank'].transform(lambda x: x - x.mean())
    test_df['w'] = test_df.groupby('eom')['w'].transform(
        lambda x: (x / x.abs().sum() * 2) if x.abs().sum() > 0 else 0.0
    )
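
    # Worked example (illustrative): for a month with 4 stocks, percentile
    # ranks are [0.25, 0.50, 0.75, 1.00]; demeaning gives
    # [-0.375, -0.125, 0.125, 0.375] (abs sum = 1.0), and scaling by 2 yields
    # weights [-0.75, -0.25, 0.25, 0.75], which sum to zero (dollar-neutral)
    # with gross exposure of 2.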

    # Handle any remaining NaN in weights
    test_df['w'] = test_df['w'].fillna(0.0)

    # Clean up memory
    del X_train, X_test, ridge_model, xgb_model
    gc.collect()

    # Return required columns only
    return test_df[['id', 'eom', 'w']].copy()


if __name__ == '__main__':
    # This block is NOT used in the CTF infrastructure
    # The CTF infrastructure calls main() directly
    # This block is kept for local testing only

    # For local testing: load data directly
    chars = pd.read_parquet('/data/ctff_chars.parquet')
    features = pd.read_parquet('/data/ctff_features.parquet')
    daily_ret = pd.read_parquet('/data/ctff_daily_ret.parquet')

    # Run the model
    output = main(chars, features, daily_ret)

    # Write output
    output.to_csv('/outputs/output.csv', index=False)
    print('Model execution completed')
