# Source code for pyvallocation.utils.data_helpers

"""Helpers for converting between pandas and NumPy structures."""

from __future__ import annotations

from typing import List, Optional

import numpy as np
import pandas as pd


def pandas_to_numpy_returns(
    dataframe: "pd.DataFrame",
    price_columns: Optional[List[str]] = None,
    date_column: Optional[str] = None,
    return_calculation_method: str = "log",
    fill_na_method: str = "ffill",
) -> np.ndarray:
    """Convert a pandas DataFrame of prices to a numpy array of returns.

    Args:
        dataframe: Price data as a DataFrame.
        price_columns: Optional list of columns to treat as prices. When
            omitted or empty, every numeric column of the frame is used.
        date_column: Optional column name to set as index. If the index is
            not already named ``date_column``, the column is parsed with
            ``pd.to_datetime`` and becomes the index.
        return_calculation_method: ``"log"`` or ``"simple"`` (default ``"log"``).
        fill_na_method: Missing data handling applied to the computed
            returns: ``"ffill"``, ``"bfill"``, ``"zero"`` or ``"drop"``
            (default ``"ffill"``). Any NaN still present afterwards is
            replaced with 0.

    Returns:
        np.ndarray: Return matrix with shape ``(T-1, N)``. With
        ``fill_na_method="drop"`` rows containing NaN are removed, so the
        result may have fewer than ``T-1`` rows.

    Raises:
        ValueError: If ``dataframe`` is not a DataFrame, the date column is
            missing or cannot be converted, a requested price column is
            missing, no numeric columns exist, or either method name is
            not recognised.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise ValueError("`dataframe` must be a pandas DataFrame.")

    # Work on a copy: the date conversion below assigns to a column.
    df = dataframe.copy()

    if date_column:
        if df.index.name != date_column:
            if date_column in df.columns:
                try:
                    df[date_column] = pd.to_datetime(df[date_column])
                    df = df.set_index(date_column)
                except Exception as e:
                    # Chain the original error so the root cause stays
                    # visible, as the other raise sites here already do.
                    raise ValueError(
                        f"Failed to set date column '{date_column}' as index: {e}"
                    ) from e
            else:
                raise ValueError(
                    f"Date column '{date_column}' not found in DataFrame columns or as index name."
                )
        # else: index name matches date_column, no action needed

    # Keep only the first index level if the frame carries a MultiIndex.
    if df.index.nlevels > 1:
        df = df.reset_index(level=list(range(1, df.index.nlevels)), drop=True)

    if price_columns:
        missing_cols = [col for col in price_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Price columns not found in DataFrame: {missing_cols}")
        price_data = df[price_columns]
    else:
        price_data = df.select_dtypes(include=np.number)
        if price_data.empty:
            raise ValueError("No numeric columns found for price data.")

    calc_map = {
        "log": lambda x: np.log(x / x.shift(1)),
        "simple": lambda x: x.pct_change(fill_method=None),
    }
    # Only the lookup is guarded, so a KeyError raised while computing the
    # returns is never misreported as an invalid method name.
    try:
        compute_returns = calc_map[return_calculation_method]
    except KeyError as exc:
        raise ValueError(
            "`return_calculation_method` must be 'log' or 'simple'."
        ) from exc
    returns = compute_returns(price_data)

    returns = returns.iloc[1:]  # First row will be NaN due to shift/pct_change

    fill_map = {
        "ffill": lambda x: x.ffill(),
        "bfill": lambda x: x.bfill(),
        "zero": lambda x: x.fillna(0),
        "drop": lambda x: x.dropna(),
    }
    try:
        fill_missing = fill_map[fill_na_method]
    except KeyError as exc:
        raise ValueError(
            "`fill_na_method` must be 'ffill', 'bfill', 'zero', or 'drop'."
        ) from exc
    returns = fill_missing(returns)

    # Fill any remaining NaNs with 0 after the chosen method (e.g. leading
    # NaNs that a forward fill cannot reach).
    returns = returns.fillna(0)
    return returns.to_numpy()
def numpy_weights_to_pandas_series(
    weights: np.ndarray, asset_names: List[str]
) -> "pd.Series":
    """Label a one-dimensional weight vector with asset names.

    Args:
        weights: NumPy array of shape ``(N,)`` holding the weights.
        asset_names: List of ``N`` strings, one per weight, used as the index.

    Returns:
        pd.Series: The weights, indexed by ``asset_names``.

    Raises:
        ValueError: If ``weights`` is not a 1D NumPy array, if
            ``asset_names`` is not a list of strings, or if the two
            lengths differ.
    """
    is_vector = isinstance(weights, np.ndarray) and weights.ndim == 1
    if not is_vector:
        raise ValueError("`weights` must be a 1D NumPy array.")

    names_valid = isinstance(asset_names, list)
    if names_valid:
        for label in asset_names:
            if not isinstance(label, str):
                names_valid = False
                break
    if not names_valid:
        raise ValueError("`asset_names` must be a list of strings.")

    n_weights = weights.shape[0]
    if len(asset_names) != n_weights:
        raise ValueError("Length of `weights` must match length of `asset_names`.")

    return pd.Series(data=weights, index=asset_names)