Spaces:

Jitendra12421
/

Miscellonoues_model_backend

Sleeping

App Files Community

Jitendra12421 committed on Mar 26

Commit

243ed84

verified ·

1 Parent(s): bb13ea9

Upload 3 files

Browse files

Files changed (3) hide show

app.py +138 -127
extractor.py +35 -7
scraper.py +7 -2

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py
 import gradio as gr
 import asyncio
 import os
@@ -30,97 +30,108 @@ MODEL_PATH = "NSEI_model.pkl"
 try:
     model = StockNewsModel.load(MODEL_PATH)
     print(f"Loaded pre-trained model from {MODEL_PATH}")
-except Exception as e:
-    print(f"Warning: Could not load model from {MODEL_PATH}. Models need to be trained first.")
-    model = None
-async def fetch_and_predict(ticker="^NSEI", days_back=3):
-    if not model:
-        return {"error": "Model not loaded. Please train the model first."}
-    scraper = NewsScraper(limit=50) # smaller limit for quick inference
-    extractor = ContentExtractor()
-    features = Features(ticker)
-    # 1. Scrape latest news
-    lookback = datetime.now() - timedelta(days=days_back)
-    articles = await scraper.scrape(ticker, lookback)
-    if not articles:
-        return {"message": f"No recent news found for {ticker}."}
-    # 2. Extract content
-    articles = await extractor.extract_all(articles)
-    # 3. Create DataFrame and prepare for features
-    df = pd.DataFrame(articles)
-    df['ts'] = pd.to_datetime(df['timestamp'], errors='coerce', utc=True)
-    df = df.dropna(subset=['ts'])
-    df['date'] = df['ts'].dt.date
-    if df.empty:
-        return {"message": "No valid timestamps found in articles."}
-    # 4. Extract basic text/NLP features
-    df_features = features.build(df)
-    # 5. Get market context
-    # Use DataPipeline's _get_prices to fetch context
-    pipeline = DataPipeline(ticker, train_days=0, test_days=0)
-    # We fetch prices from the last 30 days to calculate moving averages
-    price_df = pipeline.get_prices(datetime.now() - timedelta(days=30))
-    df_features = pipeline._add_market_context(df_features, price_df)
-    # Prepare features matching the model
-    avail_feats = [c for c in DataPipeline.FEATURE_COLS if c in df_features.columns]
-    # Fill any missing columns that the model expects with 0
-    for col in model.feature_names:
-        if col not in df_features.columns:
-            df_features[col] = 0.0
-    X = df_features[model.feature_names].fillna(0).replace([float('inf'), float('-inf')], 0)
-    # 6. Predict
-    results = model.predict_new(X)
-    # Merge results with original article details
-    df_results = pd.concat([df_features, results], axis=1)
-    df_results = df_results.sort_values(by='impact', ascending=False)
-    # Format output
-    top_articles = []
-    for i, row in df_results.head(10).iterrows():
-        # Clean title & logic
-        title = str(row.get('title', ''))
-        source = str(row.get('source_tier', 'Unknown'))
-        # Determine source string from tier if possible
-        source_name = "News Source"
-        for k, v in Features.SOURCES.items():
-            if k in title.lower():
-                source_name = k.title()
-                break
-        # Try to parse published date gracefully
-        pub_d = row.get('pub_date', '')
-        # Fallback snippet
-        snippet = str(row.get('content', ''))[:300] + "..." if len(str(row.get('content', ''))) > 300 else str(row.get('content', ''))
-        top_articles.append({
-            "id": i,
-            "title": title,
-            "source": source_name,
-            "date": pub_d,
-            "url": row.get('link', ''),
-            "impact_score": round(row.get('impact', 0), 3),
-            "sentiment": round(row.get('sent_combined', 0), 3),
-            "content": f"<p>{snippet}</p>"
-        })
-    return top_articles
-# --- START CACHING LOGIC ---
 cached_headlines = None
 last_refresh_date = None
 CACHE_LOCK = threading.Lock()
@@ -132,11 +143,11 @@ def update_cache(ticker="^NSEI"):
     global cached_headlines, last_refresh_date
     print(f"Fetching new daily market insights for {ticker}...")
     try:
-        data = asyncio.run(fetch_and_predict(ticker, days_back=3))
-        with CACHE_LOCK:
-            cached_headlines = data
-            last_refresh_date = datetime.now(IST).date()
-        print("Market insights cache successfully updated.")
     except Exception as e:
         print(f"Error fetching insights: {e}")
         traceback.print_exc()
@@ -223,33 +234,33 @@ def get_predictions(ticker="^NSEI"):
     # First request triggers a background refresh instead of blocking app startup.
     _start_initial_refresh(ticker)
     return [{"message": "Generating insights for the day... Check back in a minute."}]
-# --- END CACHING LOGIC ---
-def demo():
-    with gr.Blocks(title="Miscellaneous News Impact Analyzer") as app:
-        gr.Markdown("# Miscellaneous Model Backend")
-        with gr.Row():
-            ticker_input = gr.Textbox(label="Ticker Symbol", value="^NSEI")
-        btn = gr.Button("Fetch Latest Impactful News")
-        output = gr.JSON(label="Top Articles")
-        btn.click(
-            fn=get_predictions,
-            inputs=[ticker_input],
-            outputs=[output],
-            api_name="predict"
-        )
-    return app
-app = demo()
-if __name__ == "__main__":
-    app.queue().launch(
-        server_name="0.0.0.0",
-        server_port=int(os.environ.get("PORT", "7860")),
-        ssr_mode=False,
-        show_error=True
-    )

+# app.py
 import gradio as gr
 import asyncio
 import os
 try:
     model = StockNewsModel.load(MODEL_PATH)
     print(f"Loaded pre-trained model from {MODEL_PATH}")
+except Exception as e:
+    print(f"Warning: Could not load model from {MODEL_PATH}. Models need to be trained first.")
+    model = None
+async def fetch_and_predict(ticker="^NSEI", days_back=3):
+    if not model:
+        return {"error": "Model not loaded. Please train the model first."}
+    scraper = NewsScraper(limit=150) # Fetch more headlines for the first pass
+    extractor = ContentExtractor()
+    features = Features(ticker)
+    # 1. Scrape latest news (Fast Pass)
+    lookback = datetime.now() - timedelta(days=days_back)
+    articles = await scraper.scrape(ticker, lookback)
+    if not articles:
+        return {"message": f"No recent news found for {ticker}."}
+    # 2. Prepare for Initial Pass (Quick ML)
+    df = pd.DataFrame(articles)
+    # Map RSS 'description' to 'content' for the initial feature engineering pass
+    df['content'] = df['description'].fillna('')
+    df['ts'] = pd.to_datetime(df['timestamp'], errors='coerce', utc=True)
+    df = df.dropna(subset=['ts'])
+    df['date'] = df['ts'].dt.date
+    if df.empty:
+        return {"message": "No valid timestamps found in articles."}
+    # 3. Initial ML Ranking
+    df_init_feats = features.build(df)
+    pipeline = DataPipeline(ticker, train_days=0, test_days=0)
+    price_df = pipeline.get_prices(datetime.now() - timedelta(days=30))
+    df_init_feats = pipeline._add_market_context(df_init_feats, price_df)
+    # Prepare features for ranking
+    for col in model.feature_names:
+        if col not in df_init_feats.columns:
+            df_init_feats[col] = 0.0
+    X_init = df_init_feats[model.feature_names].fillna(0).replace([float('inf'), float('-inf')], 0)
+    init_results = model.predict_new(X_init)
+    # Merge and Sort
+    df_ranked = pd.concat([df, init_results], axis=1)
+    df_ranked = df_ranked.sort_values(by='impact', ascending=False)
+    # 4. Filtering (Survivor Selection)
+    # Select top 12 candidates for Deep Extraction
+    survivors = df_ranked.head(12).to_dict('records')
+    print(f"[Pipeline] High-impact filtering complete. Deep extracting {len(survivors)} survivor(s).")
+    # 5. Deep Extraction (Full Body + Images)
+    survivors = await extractor.extract_all(survivors)
+    # 6. Final Enrichment (Accurate ML Pass)
+    df_final = pd.DataFrame(survivors)
+    df_final_feats = features.build(df_final) # Now with full body content
+    df_final_feats = pipeline._add_market_context(df_final_feats, price_df)
+    # Prepare features for final scoring
+    for col in model.feature_names:
+        if col not in df_final_feats.columns:
+            df_final_feats[col] = 0.0
+    X_final = df_final_feats[model.feature_names].fillna(0).replace([float('inf'), float('-inf')], 0)
+    final_results = model.predict_new(X_final)
+    # Final Sort
+    # Drop the initial-pass 'impact' and 'confidence' columns so the final-pass scores replace them
+    df_final_clean = df_final.drop(columns=['impact', 'confidence'], errors='ignore')
+    df_final_scores = pd.concat([df_final_clean, final_results], axis=1)
+    df_final_scores = df_final_scores.sort_values(by='impact', ascending=False)
+    # 7. Format final JSON output
+    top_articles = []
+    for i, row in df_final_scores.head(10).iterrows():
+        title = str(row.get('title', ''))
+        # Use the source attribute from RSS if available; otherwise fall back to matching Features.SOURCES keys against the title
+        source_name = str(row.get('source', 'Unknown'))
+        if source_name == "Unknown":
+             for k, v in Features.SOURCES.items():
+                if k in title.lower():
+                    source_name = k.title()
+                    break
+        snippet = str(row.get('content', ''))[:400] + "..." if len(str(row.get('content', ''))) > 400 else str(row.get('content', ''))
+        top_articles.append({
+            "id": i,
+            "title": title,
+            "source": source_name,
+            "date": row.get('pub_date', ''),
+            "url": row.get('link', ''),
+            "image": row.get('image', ''), # Now populated from deep extraction
+            "impact_score": round(row.get('impact', 0), 3),
+            "sentiment": round(row.get('sent_combined', 0), 3),
+            "content": f"<p>{snippet}</p>"
+        })
+    return top_articles
+# --- START CACHING LOGIC ---
 cached_headlines = None
 last_refresh_date = None
 CACHE_LOCK = threading.Lock()
     global cached_headlines, last_refresh_date
     print(f"Fetching new daily market insights for {ticker}...")
     try:
+        data = asyncio.run(fetch_and_predict(ticker, days_back=3))
+        with CACHE_LOCK:
+            cached_headlines = data
+            last_refresh_date = datetime.now(IST).date()
+        print("Market insights cache successfully updated.")
     except Exception as e:
         print(f"Error fetching insights: {e}")
         traceback.print_exc()
     # First request triggers a background refresh instead of blocking app startup.
     _start_initial_refresh(ticker)
     return [{"message": "Generating insights for the day... Check back in a minute."}]
+# --- END CACHING LOGIC ---
+def demo():
+    with gr.Blocks(title="Miscellaneous News Impact Analyzer") as app:
+        gr.Markdown("# Miscellaneous Model Backend")
+        with gr.Row():
+            ticker_input = gr.Textbox(label="Ticker Symbol", value="^NSEI")
+        btn = gr.Button("Fetch Latest Impactful News")
+        output = gr.JSON(label="Top Articles")
+        btn.click(
+            fn=get_predictions,
+            inputs=[ticker_input],
+            outputs=[output],
+            api_name="predict"
+        )
+    return app
+app = demo()
+if __name__ == "__main__":
+    app.queue().launch(
+        server_name="0.0.0.0",
+        server_port=int(os.environ.get("PORT", "7860")),
+        ssr_mode=False,
+        show_error=True
+    )

extractor.py CHANGED Viewed

@@ -24,25 +24,50 @@ class ContentExtractor:
                     return self._parse_html(html)
         except:
             pass
-        return ""
     def _parse_html(self, html):
         try:
             soup = BeautifulSoup(html, 'html.parser')
             for tag in soup(
                 ['script','style','nav','header','footer',
                  'aside','iframe','noscript','form']
             ):
                 tag.decompose()
             article = soup.find('article')
             paras = (article or soup).find_all('p')
             parts = [
                 p.get_text(strip=True) for p in paras
                 if len(p.get_text(strip=True)) > 30
             ]
-            return ' '.join(parts)[:3000]
         except:
-            return ""
     async def extract_all(self, articles):
         conn = aiohttp.TCPConnector(limit=25, ssl=self.ssl_ctx)
@@ -56,9 +81,12 @@ class ContentExtractor:
                 results = await asyncio.gather(
                     *tasks, return_exceptions=True
                 )
-                for j, content in enumerate(results):
-                    articles[i+j]['content'] = (
-                        content if isinstance(content, str) else ""
-                    )
                 await asyncio.sleep(0.15)
         return articles

                     return self._parse_html(html)
         except:
             pass
+        return {"content": "", "image": ""}
     def _parse_html(self, html):
         try:
             soup = BeautifulSoup(html, 'html.parser')
+            # --- Image Extraction ---
+            img_url = ""
+            # 1. OpenGraph/Twitter Meta Tags (highest reliability)
+            meta_img = soup.find("meta", property="og:image") or \
+                       soup.find("meta", attrs={"name": "twitter:image"}) or \
+                       soup.find("meta", attrs={"name": "og:image"})
+            if meta_img and meta_img.get("content"):
+                img_url = meta_img["content"]
+            # 2. Fall back to the first absolute .jpg/.png/.jpeg image URL if no meta image was found
+            if not img_url:
+                for img in soup.find_all("img"):
+                    src = img.get("src")
+                    if src and src.startswith("http") and any(x in src.lower() for x in [".jpg", ".png", ".jpeg"]):
+                        # Skip small icons
+                        if "icon" not in src.lower() and "logo" not in src.lower():
+                            img_url = src
+                            break
+            # --- End Image Extraction ---
             for tag in soup(
                 ['script','style','nav','header','footer',
                  'aside','iframe','noscript','form']
             ):
                 tag.decompose()
             article = soup.find('article')
             paras = (article or soup).find_all('p')
             parts = [
                 p.get_text(strip=True) for p in paras
                 if len(p.get_text(strip=True)) > 30
             ]
+            content = ' '.join(parts)[:3000]
+            return {"content": content, "image": img_url}
         except:
+            return {"content": "", "image": ""}
     async def extract_all(self, articles):
         conn = aiohttp.TCPConnector(limit=25, ssl=self.ssl_ctx)
                 results = await asyncio.gather(
                     *tasks, return_exceptions=True
                 )
+                for j, res in enumerate(results):
+                    if isinstance(res, dict):
+                        articles[i+j]['content'] = res.get('content', "")
+                        articles[i+j]['image'] = res.get('image', "")
+                    else:
+                        articles[i+j]['content'] = ""
+                        articles[i+j]['image'] = ""
                 await asyncio.sleep(0.15)
         return articles

scraper.py CHANGED Viewed

@@ -34,14 +34,19 @@ class NewsScraper:
                 t = item.findtext('title')
                 l = item.findtext('link')
                 p = item.findtext('pubDate')
                 if t and l and p:
                     try:
                         dt = parsedate_to_datetime(p)
                         if dt.date() >= lb:
                             articles.append({
-                                'title': t, 'link': l,
                                 'pub_date': p,
-                                'timestamp': dt.isoformat()
                             })
                     except:
                         pass

                 t = item.findtext('title')
                 l = item.findtext('link')
                 p = item.findtext('pubDate')
+                s = item.findtext('source') # Extract source name
+                d = item.findtext('description') # Extract snippet description
                 if t and l and p:
                     try:
                         dt = parsedate_to_datetime(p)
                         if dt.date() >= lb:
                             articles.append({
+                                'title': t,
+                                'link': l,
                                 'pub_date': p,
+                                'timestamp': dt.isoformat(),
+                                'source': s if s else "Unknown",
+                                'description': d if d else ""
                             })
                     except:
                         pass