Jitendra12421 committed on
Commit
243ed84
·
verified ·
1 Parent(s): bb13ea9

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +138 -127
  2. extractor.py +35 -7
  3. scraper.py +7 -2
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py
2
  import gradio as gr
3
  import asyncio
4
  import os
@@ -30,97 +30,108 @@ MODEL_PATH = "NSEI_model.pkl"
30
  try:
31
  model = StockNewsModel.load(MODEL_PATH)
32
  print(f"Loaded pre-trained model from {MODEL_PATH}")
33
- except Exception as e:
34
- print(f"Warning: Could not load model from {MODEL_PATH}. Models need to be trained first.")
35
- model = None
36
-
37
- async def fetch_and_predict(ticker="^NSEI", days_back=3):
38
- if not model:
39
- return {"error": "Model not loaded. Please train the model first."}
40
-
41
- scraper = NewsScraper(limit=50) # smaller limit for quick inference
42
- extractor = ContentExtractor()
43
- features = Features(ticker)
44
-
45
- # 1. Scrape latest news
46
- lookback = datetime.now() - timedelta(days=days_back)
47
- articles = await scraper.scrape(ticker, lookback)
48
-
49
- if not articles:
50
- return {"message": f"No recent news found for {ticker}."}
51
-
52
- # 2. Extract content
53
- articles = await extractor.extract_all(articles)
54
-
55
- # 3. Create DataFrame and prepare for features
56
- df = pd.DataFrame(articles)
57
- df['ts'] = pd.to_datetime(df['timestamp'], errors='coerce', utc=True)
58
- df = df.dropna(subset=['ts'])
59
- df['date'] = df['ts'].dt.date
60
-
61
- if df.empty:
62
- return {"message": "No valid timestamps found in articles."}
63
-
64
- # 4. Extract basic text/NLP features
65
- df_features = features.build(df)
66
-
67
- # 5. Get market context
68
- # Use DataPipeline's _get_prices to fetch context
69
- pipeline = DataPipeline(ticker, train_days=0, test_days=0)
70
- # We fetch prices from the last 30 days to calculate moving averages
71
- price_df = pipeline.get_prices(datetime.now() - timedelta(days=30))
72
- df_features = pipeline._add_market_context(df_features, price_df)
73
-
74
- # Prepare features matching the model
75
- avail_feats = [c for c in DataPipeline.FEATURE_COLS if c in df_features.columns]
76
-
77
- # Fill any missing columns that the model expects with 0
78
- for col in model.feature_names:
79
- if col not in df_features.columns:
80
- df_features[col] = 0.0
81
-
82
- X = df_features[model.feature_names].fillna(0).replace([float('inf'), float('-inf')], 0)
83
-
84
- # 6. Predict
85
- results = model.predict_new(X)
86
-
87
- # Merge results with original article details
88
- df_results = pd.concat([df_features, results], axis=1)
89
- df_results = df_results.sort_values(by='impact', ascending=False)
90
-
91
- # Format output
92
- top_articles = []
93
- for i, row in df_results.head(10).iterrows():
94
- # Clean title & logic
95
- title = str(row.get('title', ''))
96
- source = str(row.get('source_tier', 'Unknown'))
97
-
98
- # Determine source string from tier if possible
99
- source_name = "News Source"
100
- for k, v in Features.SOURCES.items():
101
- if k in title.lower():
102
- source_name = k.title()
103
- break
104
-
105
- # Try to parse published date gracefully
106
- pub_d = row.get('pub_date', '')
107
- # Fallback snippet
108
- snippet = str(row.get('content', ''))[:300] + "..." if len(str(row.get('content', ''))) > 300 else str(row.get('content', ''))
109
-
110
- top_articles.append({
111
- "id": i,
112
- "title": title,
113
- "source": source_name,
114
- "date": pub_d,
115
- "url": row.get('link', ''),
116
- "impact_score": round(row.get('impact', 0), 3),
117
- "sentiment": round(row.get('sent_combined', 0), 3),
118
- "content": f"<p>{snippet}</p>"
119
- })
120
-
121
- return top_articles
122
-
123
- # --- START CACHING LOGIC ---
 
 
 
 
 
 
 
 
 
 
 
124
  cached_headlines = None
125
  last_refresh_date = None
126
  CACHE_LOCK = threading.Lock()
@@ -132,11 +143,11 @@ def update_cache(ticker="^NSEI"):
132
  global cached_headlines, last_refresh_date
133
  print(f"Fetching new daily market insights for {ticker}...")
134
  try:
135
- data = asyncio.run(fetch_and_predict(ticker, days_back=3))
136
- with CACHE_LOCK:
137
- cached_headlines = data
138
- last_refresh_date = datetime.now(IST).date()
139
- print("Market insights cache successfully updated.")
140
  except Exception as e:
141
  print(f"Error fetching insights: {e}")
142
  traceback.print_exc()
@@ -223,33 +234,33 @@ def get_predictions(ticker="^NSEI"):
223
  # First request triggers a background refresh instead of blocking app startup.
224
  _start_initial_refresh(ticker)
225
  return [{"message": "Generating insights for the day... Check back in a minute."}]
226
- # --- END CACHING LOGIC ---
227
-
228
- def demo():
229
- with gr.Blocks(title="Miscellaneous News Impact Analyzer") as app:
230
- gr.Markdown("# Miscellaneous Model Backend")
231
-
232
- with gr.Row():
233
- ticker_input = gr.Textbox(label="Ticker Symbol", value="^NSEI")
234
-
235
- btn = gr.Button("Fetch Latest Impactful News")
236
- output = gr.JSON(label="Top Articles")
237
-
238
- btn.click(
239
- fn=get_predictions,
240
- inputs=[ticker_input],
241
- outputs=[output],
242
- api_name="predict"
243
- )
244
-
245
- return app
246
-
247
- app = demo()
248
-
249
- if __name__ == "__main__":
250
- app.queue().launch(
251
- server_name="0.0.0.0",
252
- server_port=int(os.environ.get("PORT", "7860")),
253
- ssr_mode=False,
254
- show_error=True
255
- )
 
1
+ # app.py
2
  import gradio as gr
3
  import asyncio
4
  import os
 
30
  try:
31
  model = StockNewsModel.load(MODEL_PATH)
32
  print(f"Loaded pre-trained model from {MODEL_PATH}")
33
+ except Exception as e:
34
+ print(f"Warning: Could not load model from {MODEL_PATH}. Models need to be trained first.")
35
+ model = None
36
+
37
async def fetch_and_predict(ticker="^NSEI", days_back=3):
    """Scrape recent news for *ticker*, rank it by predicted impact, and
    return the top articles.

    The pipeline runs in two passes:

    1. Fast pass: every scraped headline is scored using only its RSS
       description as content.
    2. Deep pass: the 12 highest-scoring articles get their full body and
       image fetched, then are re-scored on the richer content.

    Args:
        ticker: Symbol passed to the scraper, feature builder and pipeline.
        days_back: How many days before now the scraper should look back.

    Returns:
        A list of up to 10 article dicts (keys: id, title, source, date, url,
        image, impact_score, sentiment, content) sorted by descending impact,
        or a single dict with an "error" or "message" key when the model is
        missing or no usable articles were found.
    """
    # Local import keeps this block self-contained; `html` is stdlib.
    import html

    if not model:
        return {"error": "Model not loaded. Please train the model first."}

    scraper = NewsScraper(limit=150)  # Fetch more headlines for the first pass
    extractor = ContentExtractor()
    features = Features(ticker)

    # 1. Scrape latest news (fast pass)
    lookback = datetime.now() - timedelta(days=days_back)
    articles = await scraper.scrape(ticker, lookback)

    if not articles:
        return {"message": f"No recent news found for {ticker}."}

    # 2. Prepare for the initial pass (quick ML)
    df = pd.DataFrame(articles)
    # Map RSS 'description' to 'content' for the initial feature pass.
    # Tolerate scrapers that do not provide a description at all.
    if 'description' in df.columns:
        df['content'] = df['description'].fillna('')
    else:
        df['content'] = ''
    df['ts'] = pd.to_datetime(df['timestamp'], errors='coerce', utc=True)
    # Reset the index after dropping rows: pd.concat(axis=1) below aligns on
    # index labels, so gaps left by dropna could pair scores with the wrong
    # article or introduce NaN rows.
    df = df.dropna(subset=['ts']).reset_index(drop=True)
    df['date'] = df['ts'].dt.date

    if df.empty:
        return {"message": "No valid timestamps found in articles."}

    # 3. Initial ML ranking
    df_init_feats = features.build(df)
    pipeline = DataPipeline(ticker, train_days=0, test_days=0)
    # 30 days of prices are fetched as market context for the features.
    price_df = pipeline.get_prices(datetime.now() - timedelta(days=30))
    df_init_feats = pipeline._add_market_context(df_init_feats, price_df)

    X_init = _build_model_inputs(df_init_feats, model.feature_names)
    init_results = model.predict_new(X_init)

    # Merge and sort
    df_ranked = pd.concat([df, init_results], axis=1)
    df_ranked = df_ranked.sort_values(by='impact', ascending=False)

    # 4. Filtering: keep the top 12 candidates for deep extraction
    survivors = df_ranked.head(12).to_dict('records')
    print(f"[Pipeline] High-impact filtering complete. Deep extracting {len(survivors)} survivor(s).")

    # 5. Deep extraction (full body + images)
    survivors = await extractor.extract_all(survivors)

    # A failed fetch leaves 'content' empty; fall back to the RSS description
    # so the article is still scored and displayed with some text.
    for article in survivors:
        if not article.get('content'):
            fallback = article.get('description')
            article['content'] = fallback if isinstance(fallback, str) else ''

    # 6. Final enrichment (accurate ML pass on full body content)
    df_final = pd.DataFrame(survivors)
    df_final_feats = features.build(df_final)
    df_final_feats = pipeline._add_market_context(df_final_feats, price_df)

    X_final = _build_model_inputs(df_final_feats, model.feature_names)
    final_results = model.predict_new(X_final)

    # Drop the first-pass scores so the final-pass ones replace them.
    df_final_clean = df_final.drop(columns=['impact', 'confidence'], errors='ignore')

    # The output below reads 'sent_combined', which lives on the feature
    # frame, not on the raw article frame; carry it over so sentiment is not
    # silently reported as 0.
    # NOTE(review): assumes features.build keeps row order and count - confirm.
    if (
        'sent_combined' in df_final_feats.columns
        and 'sent_combined' not in df_final_clean.columns
        and len(df_final_feats) == len(df_final_clean)
    ):
        df_final_clean['sent_combined'] = df_final_feats['sent_combined'].to_numpy()

    df_final_scores = pd.concat([df_final_clean, final_results], axis=1)
    df_final_scores = df_final_scores.sort_values(by='impact', ascending=False)

    # 7. Format final JSON output
    top_articles = []
    for i, row in df_final_scores.head(10).iterrows():
        title = str(row.get('title', ''))
        # Prefer the RSS source attribute; otherwise look for a known source
        # name inside the title.
        source_name = str(row.get('source', 'Unknown'))
        if source_name == "Unknown":
            lowered_title = title.lower()
            for known_source in Features.SOURCES:
                if known_source in lowered_title:
                    source_name = known_source.title()
                    break

        body = str(row.get('content', ''))
        snippet = body[:400] + "..." if len(body) > 400 else body

        top_articles.append({
            "id": i,
            "title": title,
            "source": source_name,
            "date": row.get('pub_date', ''),
            "url": row.get('link', ''),
            "image": row.get('image', ''),  # Populated by deep extraction
            "impact_score": round(row.get('impact', 0), 3),
            "sentiment": round(row.get('sent_combined', 0), 3),
            # Scraped text is untrusted: escape it before wrapping in markup.
            "content": f"<p>{html.escape(snippet, quote=False)}</p>"
        })

    return top_articles


def _build_model_inputs(feature_df, feature_names):
    """Return the model input matrix selected from *feature_df*.

    Columns the model expects but the frame lacks are added in place as 0.0;
    NaN and +/-inf values are replaced with 0 in the returned matrix.
    """
    for col in feature_names:
        if col not in feature_df.columns:
            feature_df[col] = 0.0
    return (
        feature_df[feature_names]
        .fillna(0)
        .replace([float('inf'), float('-inf')], 0)
    )
133
+
134
+ # --- START CACHING LOGIC ---
135
  cached_headlines = None
136
  last_refresh_date = None
137
  CACHE_LOCK = threading.Lock()
 
143
  global cached_headlines, last_refresh_date
144
  print(f"Fetching new daily market insights for {ticker}...")
145
  try:
146
+ data = asyncio.run(fetch_and_predict(ticker, days_back=3))
147
+ with CACHE_LOCK:
148
+ cached_headlines = data
149
+ last_refresh_date = datetime.now(IST).date()
150
+ print("Market insights cache successfully updated.")
151
  except Exception as e:
152
  print(f"Error fetching insights: {e}")
153
  traceback.print_exc()
 
234
  # First request triggers a background refresh instead of blocking app startup.
235
  _start_initial_refresh(ticker)
236
  return [{"message": "Generating insights for the day... Check back in a minute."}]
237
+ # --- END CACHING LOGIC ---
238
+
239
def demo():
    """Assemble the Gradio Blocks UI and return it.

    The page has a ticker textbox, a button, and a JSON panel; clicking the
    button calls ``get_predictions`` with the ticker and shows the result.
    The same handler is exposed under the API name "predict".
    """
    page = gr.Blocks(title="Miscellaneous News Impact Analyzer")
    with page:
        gr.Markdown("# Miscellaneous Model Backend")

        with gr.Row():
            symbol_box = gr.Textbox(label="Ticker Symbol", value="^NSEI")

        fetch_button = gr.Button("Fetch Latest Impactful News")
        results_panel = gr.JSON(label="Top Articles")

        # Wire the button to the cached prediction endpoint.
        fetch_button.click(
            get_predictions,
            [symbol_box],
            [results_panel],
            api_name="predict",
        )

    return page
257
+
258
+ app = demo()
259
+
260
+ if __name__ == "__main__":
261
+ app.queue().launch(
262
+ server_name="0.0.0.0",
263
+ server_port=int(os.environ.get("PORT", "7860")),
264
+ ssr_mode=False,
265
+ show_error=True
266
+ )
extractor.py CHANGED
@@ -24,25 +24,50 @@ class ContentExtractor:
24
  return self._parse_html(html)
25
  except:
26
  pass
27
- return ""
28
 
29
  def _parse_html(self, html):
30
  try:
31
  soup = BeautifulSoup(html, 'html.parser')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  for tag in soup(
33
  ['script','style','nav','header','footer',
34
  'aside','iframe','noscript','form']
35
  ):
36
  tag.decompose()
 
37
  article = soup.find('article')
38
  paras = (article or soup).find_all('p')
39
  parts = [
40
  p.get_text(strip=True) for p in paras
41
  if len(p.get_text(strip=True)) > 30
42
  ]
43
- return ' '.join(parts)[:3000]
 
 
44
  except:
45
- return ""
46
 
47
  async def extract_all(self, articles):
48
  conn = aiohttp.TCPConnector(limit=25, ssl=self.ssl_ctx)
@@ -56,9 +81,12 @@ class ContentExtractor:
56
  results = await asyncio.gather(
57
  *tasks, return_exceptions=True
58
  )
59
- for j, content in enumerate(results):
60
- articles[i+j]['content'] = (
61
- content if isinstance(content, str) else ""
62
- )
 
 
 
63
  await asyncio.sleep(0.15)
64
  return articles
 
24
  return self._parse_html(html)
25
  except:
26
  pass
27
+ return {"content": "", "image": ""}
28
 
29
  def _parse_html(self, html):
30
  try:
31
  soup = BeautifulSoup(html, 'html.parser')
32
+
33
+ # --- Image Extraction ---
34
+ img_url = ""
35
+ # 1. OpenGraph/Twitter Meta Tags (highest reliability)
36
+ meta_img = soup.find("meta", property="og:image") or \
37
+ soup.find("meta", attrs={"name": "twitter:image"}) or \
38
+ soup.find("meta", attrs={"name": "og:image"})
39
+
40
+ if meta_img and meta_img.get("content"):
41
+ img_url = meta_img["content"]
42
+
43
+ # 2. Fallback to largest/first relevant image if meta fails
44
+ if not img_url:
45
+ for img in soup.find_all("img"):
46
+ src = img.get("src")
47
+ if src and src.startswith("http") and any(x in src.lower() for x in [".jpg", ".png", ".jpeg"]):
48
+ # Skip small icons
49
+ if "icon" not in src.lower() and "logo" not in src.lower():
50
+ img_url = src
51
+ break
52
+ # --- End Image Extraction ---
53
+
54
  for tag in soup(
55
  ['script','style','nav','header','footer',
56
  'aside','iframe','noscript','form']
57
  ):
58
  tag.decompose()
59
+
60
  article = soup.find('article')
61
  paras = (article or soup).find_all('p')
62
  parts = [
63
  p.get_text(strip=True) for p in paras
64
  if len(p.get_text(strip=True)) > 30
65
  ]
66
+ content = ' '.join(parts)[:3000]
67
+
68
+ return {"content": content, "image": img_url}
69
  except:
70
+ return {"content": "", "image": ""}
71
 
72
  async def extract_all(self, articles):
73
  conn = aiohttp.TCPConnector(limit=25, ssl=self.ssl_ctx)
 
81
  results = await asyncio.gather(
82
  *tasks, return_exceptions=True
83
  )
84
+ for j, res in enumerate(results):
85
+ if isinstance(res, dict):
86
+ articles[i+j]['content'] = res.get('content', "")
87
+ articles[i+j]['image'] = res.get('image', "")
88
+ else:
89
+ articles[i+j]['content'] = ""
90
+ articles[i+j]['image'] = ""
91
  await asyncio.sleep(0.15)
92
  return articles
scraper.py CHANGED
@@ -34,14 +34,19 @@ class NewsScraper:
34
  t = item.findtext('title')
35
  l = item.findtext('link')
36
  p = item.findtext('pubDate')
 
 
37
  if t and l and p:
38
  try:
39
  dt = parsedate_to_datetime(p)
40
  if dt.date() >= lb:
41
  articles.append({
42
- 'title': t, 'link': l,
 
43
  'pub_date': p,
44
- 'timestamp': dt.isoformat()
 
 
45
  })
46
  except:
47
  pass
 
34
  t = item.findtext('title')
35
  l = item.findtext('link')
36
  p = item.findtext('pubDate')
37
+ s = item.findtext('source') # Extract source name
38
+ d = item.findtext('description') # Extract snippet description
39
  if t and l and p:
40
  try:
41
  dt = parsedate_to_datetime(p)
42
  if dt.date() >= lb:
43
  articles.append({
44
+ 'title': t,
45
+ 'link': l,
46
  'pub_date': p,
47
+ 'timestamp': dt.isoformat(),
48
+ 'source': s if s else "Unknown",
49
+ 'description': d if d else ""
50
  })
51
  except:
52
  pass