Upload 3 files
Browse files
- app.py +2 -2
- extractor.py +8 -3
- scraper.py +28 -31
app.py
CHANGED
|
@@ -63,13 +63,13 @@ def _run_training_job():
|
|
| 63 |
|
| 64 |
_load_model_from_disk()
|
| 65 |
|
| 66 |
-
async def fetch_and_predict(ticker="^NSEI", days_back=
|
| 67 |
with MODEL_LOCK:
|
| 68 |
current_model = model
|
| 69 |
if not current_model:
|
| 70 |
return {"error": "Model not loaded. Please train the model first."}
|
| 71 |
|
| 72 |
-
scraper = NewsScraper(limit=
|
| 73 |
extractor = ContentExtractor()
|
| 74 |
features = Features(ticker)
|
| 75 |
|
|
|
|
| 63 |
|
| 64 |
_load_model_from_disk()
|
| 65 |
|
| 66 |
+
async def fetch_and_predict(ticker="^NSEI", days_back=7):
|
| 67 |
with MODEL_LOCK:
|
| 68 |
current_model = model
|
| 69 |
if not current_model:
|
| 70 |
return {"error": "Model not loaded. Please train the model first."}
|
| 71 |
|
| 72 |
+
scraper = NewsScraper(limit=450) # Fetch 450+ headlines for the ML model
|
| 73 |
extractor = ContentExtractor()
|
| 74 |
features = Features(ticker)
|
| 75 |
|
extractor.py
CHANGED
|
@@ -31,9 +31,14 @@ class ContentExtractor:
|
|
| 31 |
parts.extend(words[:3])
|
| 32 |
|
| 33 |
# Use loremflickr which is a reliable replacement for keyword-based placeholders.
|
| 34 |
-
# Format: https://loremflickr.com/1200/675/keyword1,keyword2
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
async def _fetch_one(self, session, url):
|
| 39 |
try:
|
|
|
|
| 31 |
parts.extend(words[:3])
|
| 32 |
|
| 33 |
# Use loremflickr which is a reliable replacement for keyword-based placeholders.
|
| 34 |
+
# Format: https://loremflickr.com/1200/675/keyword1,keyword2?random=N
|
| 35 |
+
# We append a hash of the title as a 'random' seed to ensure uniqueness for different articles,
|
| 36 |
+
# but consistency for recovery if the page is refreshed.
|
| 37 |
+
query = ",".join(parts[:5]).strip() or "finance,stock,market"
|
| 38 |
+
# Always mix in a professional keyword to override potential cat/statue fallbacks
|
| 39 |
+
query = "business,trading," + query
|
| 40 |
+
seed = abs(hash(str(title) + str(source))) % 10000
|
| 41 |
+
return f"https://loremflickr.com/1200/675/{query}?random={seed}"
|
| 42 |
|
| 43 |
async def _fetch_one(self, session, url):
|
| 44 |
try:
|
scraper.py
CHANGED
|
@@ -6,7 +6,7 @@ import ssl
|
|
| 6 |
from email.utils import parsedate_to_datetime
|
| 7 |
|
| 8 |
class NewsScraper:
|
| 9 |
-
def __init__(self, limit=
|
| 10 |
self.limit = limit
|
| 11 |
self.ssl_context = ssl.create_default_context()
|
| 12 |
self.ssl_context.check_hostname = False
|
|
@@ -56,36 +56,33 @@ class NewsScraper:
|
|
| 56 |
return articles
|
| 57 |
|
| 58 |
def _build_queries(self, ticker):
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
f"{t} breaking news", f"{t} update",
|
| 87 |
-
f"{t} premarket", f"{t} after hours",
|
| 88 |
-
]
|
| 89 |
|
| 90 |
async def scrape(self, ticker, lookback_date, progress_cb=None):
|
| 91 |
queries = self._build_queries(ticker)
|
|
|
|
| 6 |
from email.utils import parsedate_to_datetime
|
| 7 |
|
| 8 |
class NewsScraper:
|
| 9 |
+
def __init__(self, limit=1000):
|
| 10 |
self.limit = limit
|
| 11 |
self.ssl_context = ssl.create_default_context()
|
| 12 |
self.ssl_context.check_hostname = False
|
|
|
|
| 56 |
return articles
|
| 57 |
|
| 58 |
def _build_queries(self, ticker):
|
| 59 |
+
# Resolve aliases if it's a known ticker to expand search coverage
|
| 60 |
+
aliases = [ticker]
|
| 61 |
+
t_low = ticker.lower()
|
| 62 |
+
if t_low in ["^nsei", "nifty", "nifty 50"]:
|
| 63 |
+
aliases.extend(["nifty 50", "nifty50", "nifty index", "nse india", "nsei stocks", "indian market", "nifty 100", "nifty next 50"])
|
| 64 |
+
elif t_low in ["^bsesn", "sensex"]:
|
| 65 |
+
aliases.extend(["sensex", "bse sensex", "bombay stock exchange", "bse india", "sensex 30"])
|
| 66 |
+
elif t_low in ["^nsebank", "banknifty"]:
|
| 67 |
+
aliases.extend(["bank nifty", "nifty bank", "banknifty", "banking stocks india", "hdfc bank news", "icici bank news"])
|
| 68 |
+
|
| 69 |
+
queries = []
|
| 70 |
+
for a in aliases[:6]: # Use more aliases for broader coverage
|
| 71 |
+
queries.extend([
|
| 72 |
+
a, f"{a} stock", f"{a} news", f"{a} market",
|
| 73 |
+
f"{a} forecast", f"{a} predictions", f"{a} today",
|
| 74 |
+
f"{a} analysis", f"{a} outlook", f"{a} update",
|
| 75 |
+
f"{a} breakout", f"{a} technicals", f"{a} sentiment"
|
| 76 |
+
])
|
| 77 |
+
|
| 78 |
+
# Add high-yield generic financial terms for context if it's a major index
|
| 79 |
+
if t_low in ["^nsei", "^bsesn", "nifty", "sensex"]:
|
| 80 |
+
queries.extend([
|
| 81 |
+
"indian stock market news", "dalal street updates",
|
| 82 |
+
"rbi policy news", "fpi flows india", "nifty earnings season"
|
| 83 |
+
])
|
| 84 |
+
|
| 85 |
+
return list(dict.fromkeys(queries)) # Remove duplicates
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
async def scrape(self, ticker, lookback_date, progress_cb=None):
|
| 88 |
queries = self._build_queries(ticker)
|