ifieryarrows committed on
Commit
4993f5e
·
verified ·
1 Parent(s): 6719be4

Sync from GitHub

Browse files
Files changed (3) hide show
  1. app/ai_engine.py +269 -23
  2. app/models.py +10 -6
  3. app/settings.py +3 -0
app/ai_engine.py CHANGED
@@ -1,5 +1,9 @@
1
  """
2
- AI Engine: FinBERT sentiment scoring + XGBoost training.
 
 
 
 
3
 
4
  Usage:
5
  python -m app.ai_engine --run-all --target-symbol HG=F
@@ -8,13 +12,17 @@ Usage:
8
  """
9
 
10
  import argparse
 
11
  import json
12
  import logging
13
  import os
 
14
  from datetime import datetime, timedelta, timezone
15
  from pathlib import Path
16
  from typing import Optional
17
 
 
 
18
  import numpy as np
19
  import pandas as pd
20
  from sqlalchemy import func
@@ -117,17 +125,207 @@ def score_text_with_finbert(
117
  }
118
 
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  def score_unscored_articles(
121
  session: Session,
122
- batch_size: int = 32
123
  ) -> int:
124
  """
125
  Score all articles that don't have sentiment scores yet.
126
- Idempotent: only scores articles without existing NewsSentiment records.
 
 
 
 
 
127
 
128
  Returns:
129
  Number of articles scored
130
  """
 
 
131
  # Find unscored articles
132
  unscored = session.query(NewsArticle).outerjoin(
133
  NewsSentiment,
@@ -140,37 +338,85 @@ def score_unscored_articles(
140
 
141
  logger.info(f"Found {len(unscored)} unscored articles")
142
 
143
- # Load model
144
- pipe = get_finbert_pipeline()
145
-
146
  scored_count = 0
 
147
 
148
- # Process in batches
149
- for i in range(0, len(unscored), batch_size):
150
- batch = unscored[i:i + batch_size]
 
151
 
152
- for article in batch:
153
- # Use title + description for scoring
154
- text = f"{article.title} {article.description or ''}"
155
-
156
- scores = score_text_with_finbert(pipe, text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  sentiment = NewsSentiment(
159
  news_article_id=article.id,
160
- prob_positive=scores["prob_positive"],
161
- prob_neutral=scores["prob_neutral"],
162
- prob_negative=scores["prob_negative"],
163
- score=scores["score"],
164
- model_name="ProsusAI/finbert",
 
165
  scored_at=datetime.now(timezone.utc)
166
  )
167
 
168
  session.add(sentiment)
169
  scored_count += 1
170
 
171
- # Commit batch
172
  session.commit()
173
- logger.info(f"Scored batch {i // batch_size + 1}: {len(batch)} articles")
 
 
 
 
 
174
 
175
  logger.info(f"Total articles scored: {scored_count}")
176
  return scored_count
@@ -647,7 +893,7 @@ def run_full_pipeline(
647
 
648
  def main():
649
  parser = argparse.ArgumentParser(
650
- description="Run AI pipeline: FinBERT scoring and XGBoost training"
651
  )
652
  parser.add_argument(
653
  "--run-all",
@@ -657,7 +903,7 @@ def main():
657
  parser.add_argument(
658
  "--score-only",
659
  action="store_true",
660
- help="Only run FinBERT scoring"
661
  )
662
  parser.add_argument(
663
  "--aggregate-only",
 
1
  """
2
+ AI Engine: LLM sentiment scoring (with FinBERT fallback) + XGBoost training.
3
+
4
+ Sentiment Analysis:
5
+ Primary: Gemini LLM with copper-specific context (1M token batch)
6
+ Fallback: FinBERT for generic financial sentiment
7
 
8
  Usage:
9
  python -m app.ai_engine --run-all --target-symbol HG=F
 
12
  """
13
 
14
  import argparse
15
+ import asyncio
16
  import json
17
  import logging
18
  import os
19
+ import time
20
  from datetime import datetime, timedelta, timezone
21
  from pathlib import Path
22
  from typing import Optional
23
 
24
+ import httpx
25
+
26
  import numpy as np
27
  import pandas as pd
28
  from sqlalchemy import func
 
125
  }
126
 
127
 
128
# =============================================================================
# LLM Sentiment Scoring (Primary - Gemini)
# =============================================================================

# Copper-specific system prompt for LLM sentiment analysis.
# Sent verbatim as the "system" message in score_batch_with_llm; it instructs
# the model to emit a bare JSON array (no markdown fences) of per-article
# scores in [-1.0, +1.0], with the bullish/bearish/neutral bands below.
LLM_SENTIMENT_SYSTEM_PROMPT = """You are a copper commodity market sentiment analyst specializing in HG=F (COMEX Copper Futures).

Analyze news headlines for their DIRECT impact on copper prices. Be specific to copper - don't just analyze general market sentiment.

BULLISH signals (+0.3 to +1.0):
- Supply disruptions (Chile/Peru mine strikes, closures, weather events)
- China demand surge (EV production increases, construction boom, infrastructure spending)
- Green energy investments (solar, wind, grid infrastructure - all copper-intensive)
- USD weakness (inverse correlation with commodities)
- Inventory drawdowns (LME/COMEX warehouse stock decreases)
- M&A activity in copper mining sector

BEARISH signals (-0.3 to -1.0):
- Demand slowdown (China property crisis, global recession fears)
- Supply increases (new mines coming online, inventory builds)
- USD strength (pressures all commodities)
- Trade war escalation (reduces global trade/demand)
- Substitution news (aluminum replacing copper in applications)

NEUTRAL (-0.3 to +0.3):
- General market news without copper-specific impact
- Mixed or unclear signals
- News about other metals without copper correlation

IMPORTANT: Return ONLY valid JSON array. No markdown, no explanation outside JSON."""
def _derive_sentiment_probs(score: float) -> tuple[float, float, float]:
    """
    Map a sentiment score in [-1, 1] to (prob_positive, prob_neutral, prob_negative).

    The LLM returns only a scalar score, but NewsSentiment rows store three
    probabilities, so they are synthesized here: |score| acts as confidence,
    shifting mass away from the ~1/3 uniform baseline toward the signed
    direction. The three values always sum to 1.0 (before the caller rounds).
    """
    confidence = abs(score)
    if score > 0:
        prob_positive = 0.33 + (confidence * 0.67)
        prob_negative = 0.33 - (confidence * 0.33)
        prob_neutral = 1.0 - prob_positive - prob_negative
    elif score < 0:
        prob_negative = 0.33 + (confidence * 0.67)
        prob_positive = 0.33 - (confidence * 0.33)
        prob_neutral = 1.0 - prob_positive - prob_negative
    else:
        prob_positive = 0.33
        prob_neutral = 0.34
        prob_negative = 0.33
    return prob_positive, prob_neutral, prob_negative


async def score_batch_with_llm(
    articles: list[dict],
) -> list[dict]:
    """
    Score a batch of articles using LLM (Gemini via OpenRouter).

    Args:
        articles: List of dicts with 'id', 'title', 'description'

    Returns:
        List of dicts with 'id', 'score', 'reasoning', 'prob_positive',
        'prob_neutral', 'prob_negative'. An empty input returns an empty
        list without making an API call.

    Raises:
        RuntimeError: API key missing, non-200 response, empty content, or
            content that parses to something other than a JSON array.
        json.JSONDecodeError: Content is not valid JSON at all.
    """
    # Robustness fix: don't burn an API call (or raise on a missing key)
    # for an empty batch.
    if not articles:
        return []

    settings = get_settings()

    if not settings.openrouter_api_key:
        raise RuntimeError("OpenRouter API key not configured")

    # Build articles text for prompt; descriptions are truncated to 200
    # chars to keep the prompt compact.
    articles_text = "\n".join([
        f"{i+1}. [ID:{a['id']}] {a['title']}" + (f" - {a['description'][:200]}" if a.get('description') else "")
        for i, a in enumerate(articles)
    ])

    user_prompt = f"""Score these {len(articles)} news articles for copper market sentiment.

Articles:
{articles_text}

Return ONLY a valid JSON array with this exact structure (no markdown code blocks):
[
{{"id": <article_id>, "score": <float from -1.0 to 1.0>, "reasoning": "<brief explanation>"}},
...
]

Rules:
- score: -1.0 (very bearish) to +1.0 (very bullish), 0 = neutral
- reasoning: 1 sentence max explaining the copper market impact
- Include ALL {len(articles)} articles in your response"""

    async with httpx.AsyncClient(timeout=60.0) as client:
        response = await client.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {settings.openrouter_api_key}",
                "Content-Type": "application/json",
                "HTTP-Referer": "https://copper-mind.vercel.app",
                "X-Title": "CopperMind Sentiment Analysis",
            },
            json={
                "model": settings.llm_sentiment_model,
                "messages": [
                    {"role": "system", "content": LLM_SENTIMENT_SYSTEM_PROMPT},
                    {"role": "user", "content": user_prompt}
                ],
                "max_tokens": 2000,
                "temperature": 0.3,  # Lower temperature for consistent scoring
            }
        )

    # The response body is fully read before the client context exits, so
    # .text / .json() remain valid here.
    if response.status_code != 200:
        raise RuntimeError(f"OpenRouter API error: {response.status_code} - {response.text}")

    data = response.json()
    content = data.get("choices", [{}])[0].get("message", {}).get("content", "")

    if not content:
        raise RuntimeError("Empty response from LLM")

    # Clean up response - remove markdown code blocks if present
    content = content.strip()
    if content.startswith("```"):
        # Remove ```json and ``` markers
        lines = content.split("\n")
        content = "\n".join(lines[1:-1] if lines[-1].strip() == "```" else lines[1:])

    # Parse JSON
    try:
        results = json.loads(content)
    except json.JSONDecodeError as e:
        logger.error(f"LLM JSON parse error: {e}\nContent: {content[:500]}")
        raise

    # Fix: previously whatever json.loads returned was iterated directly; a
    # model answering with a JSON object (or bare string) would fail with a
    # confusing error inside the loop. Validate the top-level shape so the
    # caller's FinBERT fallback triggers with a clear message instead.
    if not isinstance(results, list):
        raise RuntimeError(f"LLM response is not a JSON array: {type(results).__name__}")

    # Validate and enrich results
    enriched = []
    for item in results:
        score = float(item.get("score", 0))
        # Clamp score to [-1, 1]
        score = max(-1.0, min(1.0, score))

        # Derive the three stored probabilities from the scalar score.
        prob_positive, prob_neutral, prob_negative = _derive_sentiment_probs(score)

        enriched.append({
            "id": item.get("id"),
            "score": score,
            "reasoning": item.get("reasoning", ""),
            "prob_positive": round(prob_positive, 4),
            "prob_neutral": round(prob_neutral, 4),
            "prob_negative": round(prob_negative, 4),
        })

    return enriched
def score_batch_with_finbert(articles: list) -> list[dict]:
    """
    Score articles with FinBERT (fallback when LLM fails).

    Args:
        articles: List of NewsArticle ORM objects

    Returns:
        List of dicts with scoring results (id, score, the three class
        probabilities and model name; reasoning is always None because
        FinBERT emits no explanation)
    """
    pipeline = get_finbert_pipeline()

    def _score_one(article) -> dict:
        # Title plus (possibly absent) description, mirroring the LLM input.
        combined = f"{article.title} {article.description or ''}"
        probs = score_text_with_finbert(pipeline, combined)
        return {
            "id": article.id,
            "score": probs["score"],
            "reasoning": None,  # FinBERT doesn't provide reasoning
            "prob_positive": probs["prob_positive"],
            "prob_neutral": probs["prob_neutral"],
            "prob_negative": probs["prob_negative"],
            "model_name": "ProsusAI/finbert",
        }

    return [_score_one(article) for article in articles]
311
  def score_unscored_articles(
312
  session: Session,
313
+ chunk_size: int = 20
314
  ) -> int:
315
  """
316
  Score all articles that don't have sentiment scores yet.
317
+
318
+ Strategy:
319
+ - Primary: LLM (Gemini) with copper-specific context
320
+ - Fallback: FinBERT per chunk if LLM fails
321
+ - Chunk size: 20 articles for error isolation
322
+ - Rate limiting: 2 second delay between chunks
323
 
324
  Returns:
325
  Number of articles scored
326
  """
327
+ settings = get_settings()
328
+
329
  # Find unscored articles
330
  unscored = session.query(NewsArticle).outerjoin(
331
  NewsSentiment,
 
338
 
339
  logger.info(f"Found {len(unscored)} unscored articles")
340
 
 
 
 
341
  scored_count = 0
342
+ total_chunks = (len(unscored) + chunk_size - 1) // chunk_size
343
 
344
+ # Process in chunks
345
+ for chunk_idx in range(0, len(unscored), chunk_size):
346
+ chunk = unscored[chunk_idx:chunk_idx + chunk_size]
347
+ chunk_num = chunk_idx // chunk_size + 1
348
 
349
+ logger.info(f"Processing chunk {chunk_num}/{total_chunks} ({len(chunk)} articles)")
350
+
351
+ # Prepare articles for LLM
352
+ articles_data = [
353
+ {"id": a.id, "title": a.title, "description": a.description}
354
+ for a in chunk
355
+ ]
356
+
357
+ results = None
358
+ used_model = settings.llm_sentiment_model
359
+
360
+ # Try LLM first
361
+ if settings.openrouter_api_key:
362
+ try:
363
+ # Run async function in sync context
364
+ loop = asyncio.new_event_loop()
365
+ asyncio.set_event_loop(loop)
366
+ try:
367
+ results = loop.run_until_complete(score_batch_with_llm(articles_data))
368
+ logger.info(f"LLM scored chunk {chunk_num} successfully")
369
+ finally:
370
+ loop.close()
371
+ except Exception as e:
372
+ logger.warning(f"LLM scoring failed for chunk {chunk_num}, falling back to FinBERT: {e}")
373
+ results = None
374
+
375
+ # Fallback to FinBERT if LLM failed or not configured
376
+ if results is None:
377
+ logger.info(f"Using FinBERT fallback for chunk {chunk_num}")
378
+ results = score_batch_with_finbert(chunk)
379
+ used_model = "ProsusAI/finbert"
380
+
381
+ # Create a lookup for results
382
+ results_by_id = {r["id"]: r for r in results}
383
+
384
+ # Save to database
385
+ for article in chunk:
386
+ result = results_by_id.get(article.id)
387
+ if not result:
388
+ # If article not in results (shouldn't happen), use neutral
389
+ logger.warning(f"No result for article {article.id}, using neutral")
390
+ result = {
391
+ "score": 0.0,
392
+ "reasoning": "Missing from LLM response",
393
+ "prob_positive": 0.33,
394
+ "prob_neutral": 0.34,
395
+ "prob_negative": 0.33,
396
+ }
397
 
398
  sentiment = NewsSentiment(
399
  news_article_id=article.id,
400
+ prob_positive=result["prob_positive"],
401
+ prob_neutral=result["prob_neutral"],
402
+ prob_negative=result["prob_negative"],
403
+ score=result["score"],
404
+ reasoning=result.get("reasoning"),
405
+ model_name=result.get("model_name", used_model),
406
  scored_at=datetime.now(timezone.utc)
407
  )
408
 
409
  session.add(sentiment)
410
  scored_count += 1
411
 
412
+ # Commit after each chunk
413
  session.commit()
414
+ logger.info(f"Committed chunk {chunk_num}: {len(chunk)} articles")
415
+
416
+ # Rate limiting: 2 second delay between chunks (except last)
417
+ if chunk_idx + chunk_size < len(unscored):
418
+ logger.debug("Rate limit delay: 2 seconds")
419
+ time.sleep(2)
420
 
421
  logger.info(f"Total articles scored: {scored_count}")
422
  return scored_count
 
893
 
894
  def main():
895
  parser = argparse.ArgumentParser(
896
+ description="Run AI pipeline: LLM sentiment scoring (with FinBERT fallback) and XGBoost training"
897
  )
898
  parser.add_argument(
899
  "--run-all",
 
903
  parser.add_argument(
904
  "--score-only",
905
  action="store_true",
906
+ help="Only run sentiment scoring (LLM primary, FinBERT fallback)"
907
  )
908
  parser.add_argument(
909
  "--aggregate-only",
app/models.py CHANGED
@@ -101,7 +101,9 @@ class PriceBar(Base):
101
 
102
  class NewsSentiment(Base):
103
  """
104
- FinBERT sentiment scores for each news article.
 
 
105
  One-to-one relationship with NewsArticle.
106
  """
107
  __tablename__ = "news_sentiments"
@@ -116,17 +118,19 @@ class NewsSentiment(Base):
116
  index=True
117
  )
118
 
119
- # FinBERT probabilities
120
  prob_positive = Column(Float, nullable=False)
121
  prob_neutral = Column(Float, nullable=False)
122
  prob_negative = Column(Float, nullable=False)
123
 
124
- # Derived score: prob_positive - prob_negative
125
- # Range: [-1, 1], positive means bullish
126
  score = Column(Float, nullable=False, index=True)
127
 
128
- # Model info
129
- model_name = Column(String(100), default="ProsusAI/finbert")
 
 
 
130
 
131
  # When scored
132
  scored_at = Column(DateTime(timezone=True), nullable=False, default=datetime.utcnow)
 
101
 
102
  class NewsSentiment(Base):
103
  """
104
+ Sentiment scores for each news article.
105
+ Primary: LLM (Gemini) with copper-specific context
106
+ Fallback: FinBERT for generic financial sentiment
107
  One-to-one relationship with NewsArticle.
108
  """
109
  __tablename__ = "news_sentiments"
 
118
  index=True
119
  )
120
 
121
+ # Sentiment probabilities (LLM derives these from score)
122
  prob_positive = Column(Float, nullable=False)
123
  prob_neutral = Column(Float, nullable=False)
124
  prob_negative = Column(Float, nullable=False)
125
 
126
+ # Sentiment score: -1 (bearish) to +1 (bullish)
 
127
  score = Column(Float, nullable=False, index=True)
128
 
129
+ # LLM reasoning for the score (debug + future UI display)
130
+ reasoning = Column(Text, nullable=True)
131
+
132
+ # Model info (LLM model or "ProsusAI/finbert" for fallback)
133
+ model_name = Column(String(100), default="google/gemini-2.0-flash-exp:free")
134
 
135
  # When scored
136
  scored_at = Column(DateTime(timezone=True), nullable=False, default=datetime.utcnow)
app/settings.py CHANGED
@@ -67,6 +67,9 @@ class Settings(BaseSettings):
67
  # Twelve Data (Live Price)
68
  twelvedata_api_key: Optional[str] = None
69
 
 
 
 
70
  @property
71
  def symbols_list(self) -> list[str]:
72
  """Parse comma-separated symbols into a list."""
 
67
  # Twelve Data (Live Price)
68
  twelvedata_api_key: Optional[str] = None
69
 
70
+ # LLM Sentiment Analysis (primary; FinBERT kept as fallback)
71
+ llm_sentiment_model: str = "google/gemini-2.0-flash-exp:free"
72
+
73
  @property
74
  def symbols_list(self) -> list[str]:
75
  """Parse comma-separated symbols into a list."""