|
|
|
|
|
import gc |
|
|
import time |
|
|
import os |
|
|
import shutil |
|
|
import torch |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import json |
|
|
import re |
|
|
from datetime import datetime, timedelta |
|
|
|
|
|
|
|
|
from langchain_community.vectorstores import Chroma |
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
|
from langchain_huggingface import HuggingFacePipeline |
|
|
from langchain_classic.chains import RetrievalQA |
|
|
from langchain_classic.prompts import PromptTemplate |
|
|
from langchain_classic.docstore.document import Document |
|
|
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig |
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face Hub identifier of the causal LM used for all text generation
# in this module.
HF_GENERATION_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"

# Module-level cache for the LangChain pipeline wrapper. It starts empty and is
# filled on the first call to setup_llm_pipeline(), so the model is loaded at
# most once per process.
llm_pipeline_hf_instance = None
|
|
|
|
|
|
|
|
def extract_clean_json(response_text):
    """Extract and parse the JSON payload embedded in an LLM response.

    The candidate payload is taken from a ```json fenced block when one is
    present, otherwise from the span between the first ``{`` and the last
    ``}``. The candidate is parsed as strict JSON first; only when that fails
    are bare Python literals (``True``/``False``/``None``) rewritten to their
    JSON spellings and the parse retried. Quoted strings are never rewritten.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded JSON value (normally a dict), or ``None`` when the input
        is not a string, holds no candidate payload, or cannot be parsed.
    """
    if not isinstance(response_text, str):
        return None

    fenced = re.search(r'```json\s*(.*?)\s*```', response_text, re.DOTALL)
    if fenced:
        candidate = fenced.group(1)
    else:
        start_idx = response_text.find('{')
        end_idx = response_text.rfind('}')
        if start_idx == -1 or end_idx < start_idx:
            return None
        candidate = response_text[start_idx:end_idx + 1]

    # Try the text untouched first so that valid JSON is never altered.
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        pass

    json_spelling = {"True": "true", "False": "false", "None": "null"}

    def _to_json_literal(match):
        # Group 1 is set only for a bare literal; a quoted string matches the
        # first alternative and is returned unchanged.
        word = match.group(1)
        return json_spelling[word] if word else match.group(0)

    normalized = re.sub(
        r'"(?:\\.|[^"\\])*"|\b(True|False|None)\b',
        _to_json_literal,
        candidate,
    )
    try:
        return json.loads(normalized)
    except json.JSONDecodeError:
        return None
|
|
|
|
|
|
|
|
def setup_llm_pipeline():
    """Return the shared LangChain LLM wrapper, loading the model on first use.

    On the first call this loads the tokenizer and model named by
    ``HF_GENERATION_MODEL`` with 4-bit NF4 quantization (double quantization,
    bfloat16 compute), builds a ``text-generation`` pipeline and stores the
    ``HuggingFacePipeline`` wrapper in the module-level
    ``llm_pipeline_hf_instance``. Later calls return that cached object.

    Returns:
        The cached ``HuggingFacePipeline`` instance.
    """
    global llm_pipeline_hf_instance
    if llm_pipeline_hf_instance is not None:
        return llm_pipeline_hf_instance

    print(f"--- Loading Model: {HF_GENERATION_MODEL} ---")
    # NOTE(review): trust_remote_code=True lets the checkpoint repository run
    # its own Python code at load time. Confirm this is still required for the
    # configured model before relying on it with other checkpoints.
    tokenizer = AutoTokenizer.from_pretrained(HF_GENERATION_MODEL, trust_remote_code=True)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        HF_GENERATION_MODEL,
        trust_remote_code=True,
        quantization_config=bnb_config,
        device_map="auto",
    )

    # Greedy decoding (do_sample=False). No temperature is passed because it
    # only applies in sampling modes; supplying one alongside do_sample=False
    # has no effect and only triggers a generation-config warning.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=1024,
        do_sample=False,
        return_full_text=False,  # return only the completion, not the prompt
    )
    llm_pipeline_hf_instance = HuggingFacePipeline(pipeline=pipe)
    return llm_pipeline_hf_instance
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _summarize_segment(df_segment, selected_assets, period_name):
    """Build the plain-text performance and volatility summary for the prompt."""
    start_date = df_segment.index.min().date()
    end_date = df_segment.index.max().date()

    first_row = df_segment.iloc[0]
    last_row = df_segment.iloc[-1]
    # A starting value of 0 would divide by zero; it is treated as missing and
    # the resulting change is reported as 0%.
    pct_changes = ((last_row - first_row) / first_row.replace(0, np.nan) * 100).fillna(0)

    # Standard deviation of row-over-row returns, expressed in percent.
    std_devs = df_segment.pct_change().std() * 100

    lines = [
        f"Analysis Period: {period_name} ({start_date} to {end_date})",
        "",
        "Performance Summary over Period:",
    ]
    for asset in selected_assets:
        if asset not in df_segment.columns:
            continue
        change = pct_changes[asset]
        if change > 0:
            lines.append(f"- {asset}: gained {change:.2f}%")
        elif change < 0:
            lines.append(f"- {asset}: lost {abs(change):.2f}%")
        else:
            # Exactly flat (or unknown start value): not a loss.
            lines.append(f"- {asset}: unchanged (0.00%)")

    lines.append("")
    lines.append("Volatility Context (Daily Return Std Dev):")
    for asset in selected_assets:
        if asset not in std_devs.index:
            continue
        volatility = std_devs[asset]
        if pd.isna(volatility):
            # Too few rows to compute a standard deviation of returns.
            lines.append(f"- {asset}: n/a (not enough data)")
        else:
            lines.append(f"- {asset}: {volatility:.2f}%")

    return "\n".join(lines) + "\n"


def analyze_historical_segment(df_segment, selected_assets, period_name):
    """Analyze a slice of historical data directly, without RAG.

    Computes per-asset percentage change over the slice and the standard
    deviation of row-over-row returns, formats them into a text summary, and
    asks the LLM for a narrative report based on that summary.

    Args:
        df_segment: DataFrame slice with one column per asset. ``.date()`` is
            called on the index min/max, so the index is expected to hold
            timestamps.
        selected_assets: Column names to report on; names absent from
            ``df_segment`` are skipped.
        period_name: Human-readable label for the period, used in the log line
            and in the prompt.

    Returns:
        The stripped LLM response, or a fixed "no data" message when
        ``df_segment`` is empty.
    """
    print(f"--- Running Historical Analysis for {period_name} ---")

    # Check for data before touching the model: loading it is expensive and
    # pointless when there is nothing to analyze.
    if df_segment.empty:
        return "No data available for this period to analyze."

    data_summary = _summarize_segment(df_segment, selected_assets, period_name)

    prompt_template = """<|im_start|>system
You are a senior financial analyst. Your job is to analyze historical market data trends for selected assets over a specific time period.
Provide a concise, professional, and insightful summary of the performance, key trends, and comparative movements based *only* on the provided data summary.
Highlight significant gains, losses, or differences in volatility between the assets.

### DATA CONTEXT:
{data_summary}
<|im_end|>
<|im_start|>user
Generate the historical analysis report.
<|im_end|>
<|im_start|>assistant
"""
    pt = PromptTemplate(template=prompt_template, input_variables=["data_summary"])
    formatted_prompt = pt.format(data_summary=data_summary)

    llm = setup_llm_pipeline()
    response = llm.invoke(formatted_prompt)
    return response.strip()
|
|
|
|
|
|
|
|
|
|
|
def _classify_risk(vix_level, max_asset, max_val, safe_haven_pct):
    """Apply the rule cascade and return ``(risk_level, reason)``; first match wins."""
    if safe_haven_pct > 80.0:
        return (
            "LOW",
            f"Safe Haven Exception triggered (Safe Assets: {safe_haven_pct:.1f}% > 80%).",
        )
    if vix_level > 20.0 and safe_haven_pct < 30.0:
        return (
            "HIGH",
            f"Crash Protocol triggered (VIX {vix_level:.1f} > 20 and Safe Haven < 30%).",
        )
    if vix_level > 15.0 and max_val > 40.0:
        return (
            "HIGH",
            f"Concentration Rule triggered (VIX {vix_level:.1f} > 15 and {max_asset} > 40%).",
        )
    return "MODERATE", "Standard market conditions."


def analyze_agent_decision(current_market_data_window, proposed_allocations):
    """Hybrid analyzer: Python computes the risk verdict, the LLM explains it.

    The risk level is derived deterministically from the latest VIX value, the
    largest non-safe-haven position and the combined Cash + TLT weight. The
    LLM is asked only to summarize the strategy and justify that verdict in
    JSON form.

    Args:
        current_market_data_window: Market data supporting ``'VIX' in window``
            and ``window['VIX'].iloc[-1]`` (e.g. a DataFrame). A missing or
            empty ``VIX`` column is treated as a VIX of 0.
        proposed_allocations: Mapping of asset name to weight. Weights are
            multiplied by 100 for display, so they are presumably fractions of
            the portfolio; verify against the caller.

    Returns:
        The parsed JSON report as a dict, with ``risk_level`` forced to the
        computed verdict, or ``None`` when the LLM output cannot be parsed.
    """
    llm = setup_llm_pipeline()

    has_vix = 'VIX' in current_market_data_window
    vix_series = current_market_data_window['VIX'] if has_vix else None
    # Guard against an empty window: .iloc[-1] would raise IndexError.
    if vix_series is not None and len(vix_series) > 0:
        vix_level = vix_series.iloc[-1]
    else:
        vix_level = 0

    safe_haven_assets = ('Cash', 'TLT')
    risky_assets = {
        name: weight
        for name, weight in proposed_allocations.items()
        if name not in safe_haven_assets
    }
    if risky_assets:
        max_asset = max(risky_assets, key=risky_assets.get)
        max_val = risky_assets[max_asset] * 100
    else:
        max_asset = "None"
        max_val = 0.0

    safe_haven_pct = sum(proposed_allocations.get(name, 0) for name in safe_haven_assets) * 100

    calculated_risk, reason_code = _classify_risk(vix_level, max_asset, max_val, safe_haven_pct)

    prompt_template = """<|im_start|>system
You are a Senior Risk Analyst.
The Quantitative Engine has already processed the data and determined the Risk Level.
Your job is to summarize the strategy and explain the risk verdict to the user.

### QUANTITATIVE ENGINE OUTPUT:
- **Determined Risk Level:** {calculated_risk}
- **Primary Logic Trigger:** {reason_code}

### DATA CONTEXT:
- VIX: {vix:.2f}
- Largest Position: {max_asset} ({max_val:.1f}%)
- Safe Haven Allocation: {safe_pct:.1f}%

### INSTRUCTIONS:
1. **Strategy Summary:** Describe the allocation style (e.g., "Aggressive Tech", "Defensive Cash").
2. **Justification:** Explain the Risk Level using the "Primary Logic Trigger" provided above. Do not invent new math.

Return ONLY raw JSON:
{{
    "strategy_summary": "string",
    "risk_level": "{calculated_risk}",
    "justification": "string",
    "confidence_score": 10
}}
<|im_end|>
<|im_start|>user
Generate the report.
<|im_end|>
<|im_start|>assistant
"""
    pt = PromptTemplate(
        template=prompt_template,
        input_variables=["calculated_risk", "reason_code", "vix", "max_asset", "max_val", "safe_pct"],
    )
    formatted = pt.format(
        calculated_risk=calculated_risk,
        reason_code=reason_code,
        vix=vix_level,
        max_asset=max_asset,
        max_val=max_val,
        safe_pct=safe_haven_pct,
    )

    res = llm.invoke(formatted)
    report = extract_clean_json(res)
    # The verdict is owned by the deterministic rules above; never let the
    # model's wording override it.
    if isinstance(report, dict):
        report["risk_level"] = calculated_risk
    return report
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: builds synthetic price data and runs one historical
    # analysis end to end (this loads the model).
    print("Running test...")

    n_days = 180
    # Seeded generator so the synthetic noise, and therefore the summary fed
    # to the model, is the same on every run.
    rng = np.random.default_rng(42)

    dates = pd.date_range(start="2023-01-01", periods=n_days, freq='D')
    df_dummy = pd.DataFrame({
        'SPY': np.linspace(400, 450, n_days) + rng.normal(0, 5, n_days),
        'BTC-USD': np.linspace(30000, 40000, n_days) + rng.normal(0, 1000, n_days),
        'VIX': np.linspace(20, 15, n_days),
    }, index=dates)

    selected = ['SPY', 'BTC-USD']
    period = "6 Months"
    print(f"\nTesting analysis for {selected} over {period}...")
    analysis = analyze_historical_segment(df_dummy, selected, period)
    print("\n--- LLM Analysis Output ---")
    print(analysis)