Jitendra12421 committed on
Commit
f17c710
·
verified ·
1 Parent(s): 6e6d45c

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +2 -2
  2. extractor.py +8 -3
  3. scraper.py +28 -31
app.py CHANGED
@@ -63,13 +63,13 @@ def _run_training_job():
63
 
64
  _load_model_from_disk()
65
 
66
- async def fetch_and_predict(ticker="^NSEI", days_back=3):
67
  with MODEL_LOCK:
68
  current_model = model
69
  if not current_model:
70
  return {"error": "Model not loaded. Please train the model first."}
71
 
72
- scraper = NewsScraper(limit=150) # Fetch more headlines for the first pass
73
  extractor = ContentExtractor()
74
  features = Features(ticker)
75
 
 
63
 
64
  _load_model_from_disk()
65
 
66
+ async def fetch_and_predict(ticker="^NSEI", days_back=7):
67
  with MODEL_LOCK:
68
  current_model = model
69
  if not current_model:
70
  return {"error": "Model not loaded. Please train the model first."}
71
 
72
+ scraper = NewsScraper(limit=450) # Fetch 450+ headlines for the ML model
73
  extractor = ContentExtractor()
74
  features = Features(ticker)
75
 
extractor.py CHANGED
@@ -31,9 +31,14 @@ class ContentExtractor:
31
  parts.extend(words[:3])
32
 
33
  # Use loremflickr which is a reliable replacement for keyword-based placeholders.
34
- # Format: https://loremflickr.com/1200/675/keyword1,keyword2
35
- query = ",".join(parts[:5]).strip() or "market,finance"
36
- return f"https://loremflickr.com/1200/675/{query}"
 
 
 
 
 
37
 
38
  async def _fetch_one(self, session, url):
39
  try:
 
31
  parts.extend(words[:3])
32
 
33
  # Use loremflickr which is a reliable replacement for keyword-based placeholders.
34
+ # Format: https://loremflickr.com/1200/675/keyword1,keyword2?random=N
35
+ # We append a hash of the title as a 'random' seed to ensure uniqueness for different articles,
36
+ # but consistency for recovery if the page is refreshed (only within one running process: Python salts str hashes per run unless PYTHONHASHSEED is fixed).
37
+ query = ",".join(parts[:5]).strip() or "finance,stock,market"
38
+ # Always mix in a professional keyword to override potential cat/statue fallbacks
39
+ query = "business,trading," + query
40
+ seed = abs(hash(str(title) + str(source))) % 10000
41
+ return f"https://loremflickr.com/1200/675/{query}?random={seed}"
42
 
43
  async def _fetch_one(self, session, url):
44
  try:
scraper.py CHANGED
@@ -6,7 +6,7 @@ import ssl
6
  from email.utils import parsedate_to_datetime
7
 
8
  class NewsScraper:
9
- def __init__(self, limit=600):
10
  self.limit = limit
11
  self.ssl_context = ssl.create_default_context()
12
  self.ssl_context.check_hostname = False
@@ -56,36 +56,33 @@ class NewsScraper:
56
  return articles
57
 
58
  def _build_queries(self, ticker):
59
- t = ticker
60
- return [
61
- t, f"{t} stock", f"{t} news", f"{t} market",
62
- f"{t} earnings", f"{t} analyst", f"{t} forecast",
63
- f"{t} price target", f"{t} options", f"{t} technical",
64
- f"{t} dividend", f"{t} industry", f"{t} competitor",
65
- f"{t} share price", f"{t} hedge fund",
66
- f"{t} institutional",
67
- f"{t} buy sell hold", f"{t} upgrade downgrade",
68
- f"{t} outperform underperform",
69
- f"{t} bullish bearish", f"{t} momentum",
70
- f"{t} breakout breakdown", f"{t} rally crash",
71
- f"{t} surge plunge", f"{t} soar tumble",
72
- f"{t} gains losses", f"{t} beat miss expectations",
73
- f"{t} CEO news", f"{t} quarterly results",
74
- f"{t} revenue profit", f"{t} guidance outlook",
75
- f"{t} acquisition merger", f"{t} lawsuit legal SEC",
76
- f"{t} insider trading", f"{t} buyback repurchase",
77
- f"{t} partnership deal", f"{t} product launch",
78
- f"{t} layoffs restructuring", f"{t} expansion growth",
79
- f"{t} wall street", f"{t} analyst rating",
80
- f"{t} price prediction", f"{t} short interest",
81
- f"{t} short squeeze", f"{t} put call ratio",
82
- f"{t} sector outlook", f"{t} industry trend",
83
- f"{t} supply chain", f"{t} regulation policy",
84
- f"{t} inflation impact", f"{t} interest rate",
85
- f"{t} today", f"{t} this week", f"{t} latest",
86
- f"{t} breaking news", f"{t} update",
87
- f"{t} premarket", f"{t} after hours",
88
- ]
89
 
90
  async def scrape(self, ticker, lookback_date, progress_cb=None):
91
  queries = self._build_queries(ticker)
 
6
  from email.utils import parsedate_to_datetime
7
 
8
  class NewsScraper:
9
+ def __init__(self, limit=1000):
10
  self.limit = limit
11
  self.ssl_context = ssl.create_default_context()
12
  self.ssl_context.check_hostname = False
 
56
  return articles
57
 
58
  def _build_queries(self, ticker):
59
+ # Resolve aliases if it's a known ticker to expand search coverage
60
+ aliases = [ticker]
61
+ t_low = ticker.lower()
62
+ if t_low in ["^nsei", "nifty", "nifty 50"]:
63
+ aliases.extend(["nifty 50", "nifty50", "nifty index", "nse india", "nsei stocks", "indian market", "nifty 100", "nifty next 50"])
64
+ elif t_low in ["^bsesn", "sensex"]:
65
+ aliases.extend(["sensex", "bse sensex", "bombay stock exchange", "bse india", "sensex 30"])
66
+ elif t_low in ["^nsebank", "banknifty"]:
67
+ aliases.extend(["bank nifty", "nifty bank", "banknifty", "banking stocks india", "hdfc bank news", "icici bank news"])
68
+
69
+ queries = []
70
+ for a in aliases[:6]: # Use more aliases for broader coverage
71
+ queries.extend([
72
+ a, f"{a} stock", f"{a} news", f"{a} market",
73
+ f"{a} forecast", f"{a} predictions", f"{a} today",
74
+ f"{a} analysis", f"{a} outlook", f"{a} update",
75
+ f"{a} breakout", f"{a} technicals", f"{a} sentiment"
76
+ ])
77
+
78
+ # Add high-yield generic financial terms for context if it's a major index
79
+ if t_low in ["^nsei", "^bsesn", "nifty", "sensex"]:
80
+ queries.extend([
81
+ "indian stock market news", "dalal street updates",
82
+ "rbi policy news", "fpi flows india", "nifty earnings season"
83
+ ])
84
+
85
+ return list(dict.fromkeys(queries)) # Remove duplicates
 
 
 
86
 
87
  async def scrape(self, ticker, lookback_date, progress_cb=None):
88
  queries = self._build_queries(ticker)