# ---------------------------------------------------------------------------
# Imports: stdlib, then third-party, then local flat-file modules (PEP 8).
# ---------------------------------------------------------------------------
import datetime
import json
import os
import re
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import feedparser
import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
import requests

# local modules (flat files in the root)
from search import get_news
from llm import summarize
from huggingface_nlp import analyze_sentiment, analyze_entities, extract_keywords
from aws import s3_upload, ses_send_email
from cache import get_cache, set_cache


# ---------------------- Time helpers ----------------------
def ts_now_utc():
    """Return the current timezone-aware UTC timestamp."""
    return datetime.datetime.now(datetime.timezone.utc)


def human_ago(dt_utc):
    """Render the elapsed time since *dt_utc* as a compact string.

    Granularity tops out at hours, so multi-day deltas still render as
    e.g. "52h ago" (matches the original behavior).

    Args:
        dt_utc: a timezone-aware UTC datetime in the past.

    Returns:
        "Ns ago", "Nm ago", or "Nh ago".
    """
    delta = ts_now_utc() - dt_utc
    s = int(delta.total_seconds())
    if s < 60:
        return f"{s}s ago"
    m = s // 60
    if m < 60:
        return f"{m}m ago"
    h = m // 60
    return f"{h}h ago"


# ---------------------- Presets ----------------------
# One-click analysis modes: each maps a UI label to a default topic plus
# extra query terms appended to the news search.
ONE_CLICK = {
    "Healthcare AI": {"topic": "Healthcare AI", "query_hint": "hospital AI diagnostics EMR"},
    "Drug Discovery": {"topic": "Drug discovery", "query_hint": "clinical trials FDA approvals biotech"},
    "Hospital Staffing Trends": {"topic": "Hospital staffing", "query_hint": "nurse shortage hospital layoffs hiring"},
    "Finance (Earnings/Stocks)": {"topic": "Earnings season", "query_hint": "earnings guidance revenue EPS"},
    "Tech R&D (Patents/AI)": {"topic": "AI research", "query_hint": "foundation models patents transformer LLM"},
    "General": {"topic": "", "query_hint": ""},
}

# De-duplicated (set literal) and alphabetized company presets.
# NOTE: sorted() accepts any iterable, so the redundant list() wrapper
# around the set was dropped (ruff C414).
H1B_TECH_PRESETS = sorted({
    "Google", "Apple", "Meta", "Amazon", "Microsoft", "Netflix", "NVIDIA", "Tesla", "Oracle", "Salesforce",
    "IBM", "Intel", "Qualcomm", "Cisco", "Adobe", "Uber", "Airbnb", "ServiceNow", "Snowflake",
    "Databricks", "OpenAI", "Palantir", "Zoom", "Workday", "Stripe", "Block", "Atlassian", "DoorDash",
    "eBay", "LinkedIn", "Lyft", "Reddit", "Shopify", "Pinterest", "Cloudflare", "Twilio", "Splunk",
    "AMD", "MongoDB", "HashiCorp", "GitHub", "GitLab", "Coinbase", "TikTok", "Bytedance",
})

# ---------------------- Styling ----------------------
# Per-sentiment card styling: badge color + emoji keyed by the upper-cased
# sentiment label produced by the NLP pipeline.
SENTI = {
    "POSITIVE": {"color": "#10b981", "emoji": "🟢"},
    "NEGATIVE": {"color": "#ef4444", "emoji": "🔴"},
    "NEUTRAL": {"color": "#f59e0b", "emoji": "🟡"},
    "MIXED": {"color": "#06b6d4", "emoji": "🔵"},
}

# Light-theme stylesheet injected into the Gradio page.
CSS_BASE = """
:root{
  --bg:#f7f8fb; --panel:#ffffff; --text:#0f172a; --muted:#475569;
  --card:#ffffff; --chip:#eef2ff;
  --shadow: 0 10px 24px rgba(2,6,23,.08);
}
* { font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial; }
body { background: var(--bg); color: var(--text); }
#root, .gradio-container { background: var(--bg); }
.container { max-width: 1200px; margin: 0 auto; }
.hero { display:flex; align-items:center; gap:14px; margin: 8px 0 18px; }
.title { font-weight:800; font-size: 28px; line-height:1.1; }
.subtitle { color: var(--muted); font-size:14px; }
.grid { display:grid; grid-template-columns: repeat(2, minmax(0,1fr)); gap:14px; }
@media (max-width: 900px){ .grid { grid-template-columns: 1fr; } }
.card { background: var(--card); padding:16px; border-radius:16px; box-shadow: var(--shadow); transition: transform .12s ease; border: 1px solid #e5e7eb; }
.card:hover { transform: translateY(-2px); }
.ctitle { font-weight:800; font-size:18px; margin-bottom:6px; }
.ctitle a { color:#0f172a; }
.ctitle a:hover { text-decoration:underline; }
.csummary { font-size:14px; line-height:1.55; margin:8px 0 10px; color:#334155; }
.row { display:flex; align-items:center; justify-content:space-between; gap:8px; flex-wrap: wrap; }
.badge { padding:4px 10px; border-radius:999px; color:white; font-weight:700; font-size:12px; display:inline-flex; gap:6px; align-items:center; box-shadow: 0 1px 0 rgba(15,23,42,.06); }
.tags { display:flex; gap:8px; flex-wrap: wrap; }
.tag { background: var(--chip); color: var(--text); opacity:.9; padding:4px 10px; border-radius:999px; font-size:12px; }
a { color: #0b5dd7; text-decoration: none; }
a:hover { text-decoration: underline; }
.small { color: var(--muted); font-size:12px; margin: 6px 0 0; }
"""
---------------------- def format_summary_html(s: str) -> str: """Bold the three bullet labels so recruiters can scan quickly.""" if not s: return "" s = re.sub(r'^\s*1[\)\.]\s*', 'What happened: ', s, flags=re.IGNORECASE|re.MULTILINE) s = re.sub(r'^\s*2[\)\.]\s*', 'Business impact: ', s, flags=re.IGNORECASE|re.MULTILINE) s = re.sub(r'^\s*3[\)\.]\s*', 'Risk or opportunity: ', s, flags=re.IGNORECASE|re.MULTILINE) return s def render_cards(rows: list[dict], entity_filter: str | None = None, sentiment_filter: str | None = None) -> str: if entity_filter: rows = [r for r in rows if entity_filter.lower() in (r.get("Entities","").lower())] if sentiment_filter and sentiment_filter != "ALL": rows = [r for r in rows if r.get("Sentiment","").upper() == sentiment_filter] html = [f"
"] if not rows: html.append("No results.") for r in rows: senti = SENTI.get(r["Sentiment"].upper(), SENTI["NEUTRAL"]) badge_style = f"background:{senti['color']}" ents = [e for e in (r.get('Entities') or '').split(', ') if e][:4] tag_html = "".join([f"{x}" for x in ents]) or "No entities" source_html = f"" html.append(f"""
{format_summary_html(r['Summary'])}
{senti['emoji']} {r['Sentiment'].title()}
{tag_html}
{source_html}
""") html.append("
") return "\n".join(html) def make_sentiment_chart(df: pd.DataFrame): if df.empty: return px.bar() counts = df["Sentiment"].value_counts().reindex(["POSITIVE","NEUTRAL","NEGATIVE","MIXED"]).fillna(0).reset_index() counts.columns = ["Sentiment","Count"] fig = px.bar(counts, x="Sentiment", y="Count", text="Count", height=340, title="Sentiment distribution") fig.update_traces(textposition="outside") fig.update_layout(margin=dict(l=10,r=10,t=40,b=10), template="plotly_white", xaxis_title=None, yaxis_title=None) return fig def make_trend_chart(df: pd.DataFrame): if df.empty or "Date" not in df.columns: return px.line() trend = df.copy() trend["Score"] = trend["Sentiment"].map({"POSITIVE":1, "NEUTRAL":0, "NEGATIVE":-1, "MIXED":0}).fillna(0) trend = trend.groupby("Date", as_index=False)["Score"].mean() fig = px.line(trend, x="Date", y="Score", title="Avg sentiment over time (by day)") fig.update_layout(margin=dict(l=10,r=10,t=40,b=10), template="plotly_white", yaxis_range=[-1,1]) return fig def make_forecast_chart(df: pd.DataFrame): """Linear fit on daily average sentiment -> 7-day projection.""" if df.empty or "Date" not in df.columns: return px.line(title="Forecast (insufficient data)") work = df.copy() work["Score"] = work["Sentiment"].map({"POSITIVE":1,"NEUTRAL":0,"NEGATIVE":-1,"MIXED":0}).fillna(0) daily = work.groupby("Date", as_index=False)["Score"].mean().sort_values("Date") if len(daily) < 3: return px.line(daily, x="Date", y="Score", title="Forecast (needs β‰₯3 days)", template="plotly_white") x = pd.to_datetime(daily["Date"]).map(pd.Timestamp.toordinal).to_numpy() y = daily["Score"].to_numpy() a, b = np.polyfit(x, y, 1) last_day = pd.to_datetime(daily["Date"]).max() fut_dates = [last_day + pd.Timedelta(days=i) for i in range(1,8)] x_future = np.array([d.toordinal() for d in fut_dates]) y_future = a * x_future + b base = px.line(daily, x="Date", y="Score", title="Sentiment: history & 7-day linear forecast", markers=True) fut = pd.DataFrame({"Date": fut_dates, 
"Score": y_future}) base.add_scatter(x=fut["Date"], y=fut["Score"], mode="lines+markers", name="Forecast") base.update_layout(margin=dict(l=10,r=10,t=40,b=10), template="plotly_white", yaxis_range=[-1,1]) return base # ---------------------- Extra sources (simple + free) ---------------------- HEADERS = {"User-Agent": "NewsIntel/1.0"} def fetch_press_releases(topic: str, limit: int = 5): q = requests.utils.quote(f"{topic} press release") url = f"https://news.google.com/rss/search?q={q}" feed = feedparser.parse(url) return [{"title":e.get("title",""),"link":e.get("link","")} for e in feed.entries[:limit]] def _try_greenhouse(board: str): api = f"https://boards-api.greenhouse.io/v1/boards/{board}/jobs" try: r = requests.get(api, timeout=20, headers=HEADERS) if r.status_code == 200: data = r.json().get("jobs", []) return [{"title": j.get("title",""), "location": (j.get("location") or {}).get("name",""), "url": j.get("absolute_url","")} for j in data] except Exception: pass return [] def _try_lever(board: str): api = f"https://api.lever.co/v0/postings/{board}?mode=json" try: r = requests.get(api, timeout=20, headers=HEADERS) if r.status_code == 200: data = r.json() return [{"title": j.get("text",""), "location": j.get("categories",{}).get("location",""), "url": j.get("hostedUrl","")} for j in data] except Exception: pass return [] def fetch_jobs(topic: str, limit: int = 8): board_guess = topic.lower().replace(" ", "") jobs = _try_greenhouse(board_guess) or _try_lever(board_guess) return jobs[:limit] def grounded_summary(news_text: str, context: str = "") -> str: prompt = ( "You are an analyst. Ground your bullets ONLY in the provided context. 
" "If the context is insufficient, say 'insufficient context'.\n\n" f"Context:\n{(context or '').strip()}\n\n" "Task: Summarize the following news into 3 bullets:\n" "1) What happened 2) Business impact 3) Risk or opportunity\n" "Limit ~90 words.\n\n" f"News:\n{news_text}" ) return summarize(prompt) def make_briefing(topic: str, rows: list[dict], press: list[dict], jobs: list[dict], timestamp_str: str) -> str: news_bits = "\n".join([f"- {r['Title']} ({r['URL']})" for r in rows[:6]]) or "β€”" press_bits = "\n".join([f"- {p['title']} ({p['link']})" for p in press[:5]]) or "β€”" jobs_bits = "\n".join([f"- {j['title']} ({j.get('location','')}) β€” {j['url']}" for j in jobs[:5]]) or "No jobs found." prompt = ( f"You are preparing an interview briefing about '{topic}'. " "Synthesize:\n" f"Recent news:\n{news_bits}\n\n" f"Press releases:\n{press_bits}\n\n" f"Open roles snapshot:\n{jobs_bits}\n\n" "Output:\n- 3 bullets: momentum (facts)\n- 3 bullets: risks\n- 3 bullets: opportunities\n" "- 3 bullets: interview talking points with 1–2 citations.\nKeep it under 220 words.\n" f"(Data last updated: {timestamp_str})" ) return summarize(prompt) # ---------------------- Core pipeline with caching ---------------------- def agentic_get_news(topic: str, days: int, k: int, query_hint: str = ""): cache_obj = get_cache("news", topic, days, k, query_hint) if cache_obj: return cache_obj query = " OR ".join([ f"{topic} {query_hint}".strip(), f'"{topic}" AND {query_hint}'.strip(), f"{topic} AI", ]) res = get_news(query, days, k) if len(res) < k: res = get_news(query, min(days + 7, 30), k) set_cache(res, "news", topic, days, k, query_hint) return res def cached_press(topic: str): c = get_cache("press", topic) if c: return c p = fetch_press_releases(topic, 6) set_cache(p, "press", topic) return p def cached_jobs(topic: str): c = get_cache("jobs", topic) if c: return c j = fetch_jobs(topic, 8) set_cache(j, "jobs", topic) return j def run_pipeline(topic, days, k, query_hint="", fast=True): 
articles = agentic_get_news(topic, int(days), int(k), query_hint=query_hint) press = cached_press(topic) jobs = cached_jobs(topic) today = datetime.date.today() rows, metrics = [], [] def _process(a): base_text = f"{a['title']} β€” {a['snippet']}" t0 = time.time() summary = grounded_summary(base_text, context=a.get("snippet","")) latency = time.time() - t0 sent = analyze_sentiment(summary) ents = [] if fast else analyze_entities(summary) kws = [] if fast else extract_keywords(summary, top_n=6) row = { "Title": a["title"], "URL": a["url"], "Summary": summary, "Sentiment": sent["label"].upper(), "Entities": "" if fast else ", ".join({e["word"] for e in ents[:6]}), "Key Phrases": "" if fast else ", ".join({k["keyword"] for k in kws[:6]}), "Date": a.get("published_date") or today, } met = { "title": a["title"], "latency_sec": round(latency,3), "summary_tokens": len(summary.split()), "sentiment": sent["label"].upper(), "entity_count": 0 if fast else len(ents) } return row, met with ThreadPoolExecutor(max_workers=min(4, max(1, k))) as ex: futures = [ex.submit(_process, a) for a in articles] for fut in as_completed(futures): r, m = fut.result() rows.append(r); metrics.append(m) rows.sort(key=lambda x: x["Title"]) metrics.sort(key=lambda x: x["title"]) df = pd.DataFrame(rows) mdf = pd.DataFrame(metrics) now = ts_now_utc() timestamp_str = f"{now.strftime('%b %d, %Y %I:%M %p')} UTC β€’ {human_ago(now)}" briefing = make_briefing(topic, rows, press, jobs, timestamp_str) rollup = pd.DataFrame([{ "articles": len(rows), "jobs_found": len(jobs), "press_releases": len(press), "avg_latency_sec": round(mdf["latency_sec"].mean(),3) if not mdf.empty else 0.0, "updated_at": timestamp_str }]) return rows, df, mdf, rollup, briefing, press, jobs, timestamp_str # ---------------------- Exporters (with branding) ---------------------- def export_briefing_html(topic: str, briefing_md: str, timestamp_str: str): html = f""" {topic} β€” Briefing

{topic} β€” Interview Briefing

Data last updated: {timestamp_str}
{briefing_md}

Generated by NewsIntel Agent β€” Hasitha Varada
""" path = os.path.join(tempfile.gettempdir(), f"{topic}_briefing.html") with open(path, "w", encoding="utf-8") as f: f.write(html) return path try: import reportlab # optional HAS_PDF = True except Exception: HAS_PDF = False def export_briefing_pdf(topic: str, briefing_md: str, timestamp_str: str): if not HAS_PDF: return None from reportlab.lib.pagesizes import letter from reportlab.pdfgen import canvas from reportlab.lib.units import inch path = os.path.join(tempfile.gettempdir(), f"{topic}_briefing.pdf") c = canvas.Canvas(path, pagesize=letter) width, height = letter x, y = 0.75*inch, height - 1*inch c.setFont("Helvetica-Bold", 14); c.drawString(x, y, f"{topic} β€” Interview Briefing") c.setFont("Helvetica", 9); y -= 0.25*inch; c.drawString(x, y, f"Data last updated: {timestamp_str}") c.setFont("Times-Roman", 11); y -= 0.35*inch for line in briefing_md.splitlines(): if not line.strip(): y -= 0.18*inch; continue c.drawString(x, y, line[:115]); y -= 0.18*inch if y < 1*inch: c.setFont("Helvetica", 9) c.drawString(x, 0.7*inch, "Generated by NewsIntel Agent β€” Hasitha Varada") c.showPage() x, y = 0.75*inch, height - 1*inch c.setFont("Times-Roman", 11) c.setFont("Helvetica", 9) c.drawString(x, 0.7*inch, "Generated by NewsIntel Agent β€” Hasitha Varada") c.save() return path # ---------------------- Gradio callbacks ---------------------- def estimate_eta_secs(k:int, fast:bool) -> int: base = 1 if fast else 3 overhead = 2 return max(3, base * int(k) + overhead) def start_banner(k, fast_mode): eta = estimate_eta_secs(k, bool(fast_mode)) return f"⏳ Running analysis (~{eta}s). Models are warm-started; first run may take longer..." 
def analyze_news(mode, preset_company, topic, days, k, entity_filter, sentiment_filter, fast_mode):
    """Main run callback: execute the pipeline and fan results out to the UI.

    Returns the tuple wired to: header, cards HTML, three plots, results
    table, per-article metrics, rollup, briefing markdown, and an update
    refreshing the entity-filter choices.
    """
    # The .get default was unreachable here (guarded by `mode in ONE_CLICK`),
    # so it was dropped.
    query_hint = ONE_CLICK[mode]["query_hint"] if mode in ONE_CLICK else ""
    if preset_company and preset_company.lower() not in (topic or "").lower():
        topic = f"{topic} {preset_company}".strip()
    rows, df, mdf, rollup, briefing, press, jobs, ts = run_pipeline(
        topic, days, k, query_hint=query_hint, fast=bool(fast_mode)
    )
    cards_html = render_cards(rows, entity_filter or None, sentiment_filter or None)
    # Distinct entity names across all rows, capped at 50 dropdown choices.
    all_ents = sorted(set(
        e.strip() for r in rows for e in (r.get("Entities", "").split(", ")) if e.strip()
    ))[:50]
    header = f"🗞️ NewsIntel — Data last updated: {ts}"
    return (
        header,
        cards_html,
        make_sentiment_chart(df),
        make_trend_chart(df),
        make_forecast_chart(df),
        df,
        mdf if not mdf.empty else pd.DataFrame([{"note": "No per-article metrics yet"}]),
        rollup,
        briefing,
        gr.update(choices=all_ents),
    )


def export_cb(topic, briefing_md, timestamp_str):
    """Create HTML/PDF briefing files, upload to S3 when configured, return
    (html_path, pdf_path_or_None, links_html).

    NOTE(review): the anchor markup was stripped in the source as received
    and has been reconstructed — confirm against the original file.
    """
    html_path = export_briefing_html(topic, briefing_md, timestamp_str)
    pdf_path = export_briefing_pdf(topic, briefing_md, timestamp_str)
    html_url = s3_upload(html_path)
    pdf_url = s3_upload(pdf_path) if pdf_path else None
    links = []
    if html_url:
        links.append(f"<a href='{html_url}' target='_blank'>View HTML on S3</a>")
    if pdf_url:
        links.append(f"<a href='{pdf_url}' target='_blank'>View PDF on S3</a>")
    links_html = "<br>".join(links) if links else "(S3 links will appear here if configured)"
    return html_path, (pdf_path or None), links_html


def email_weekly_cb(topic, email, briefing_md, timestamp_str):
    """Send the current briefing via SES; return a human-readable status string.

    NOTE(review): the email-body markup was stripped in the source as
    received and has been reconstructed — confirm against the original file.
    """
    if not email:
        return "Enter your email first."
    if not (briefing_md or "").strip():
        return "No briefing yet — run analysis first."
    ok = ses_send_email(
        email,
        f"Weekly Briefing – {topic}",
        f"<h2>{topic} – Weekly Briefing</h2>"
        f"<p><em>Data last updated: {timestamp_str}</em></p>"
        f"<div>{briefing_md}</div>",
    )
    return "Email sent via SES ✅" if ok else "SES not configured or send failed ❌"


# ---------------------- UI ----------------------
with gr.Blocks(theme=gr.themes.Soft(primary_hue="violet", neutral_hue="slate")) as demo:
    # Hero banner. NOTE(review): the original markup was stripped in the
    # source as received; reconstructed from the visible text and CSS_BASE
    # class names (.hero/.title/.subtitle/.tag). CSS_BASE was defined but
    # unused in the mangled source — presumably it was injected here; verify.
    gr.HTML(f"""
<style>{CSS_BASE}</style>
<div class="container">
  <div class="hero">
    <div style="font-size:34px">🗞️</div>
    <div>
      <div class="title">NewsIntel Agent — Job Briefings &amp; Hiring Signals</div>
      <div class="subtitle tags">
        <span class="tag">One‑click modes</span>
        <span class="tag">Cached results</span>
        <span class="tag">Branded HTML/PDF</span>
        <span class="tag">Optional S3/SES</span>
      </div>
    </div>
  </div>
</div>
""")

    # ---------- inputs ----------
    with gr.Row():
        mode = gr.Dropdown(choices=list(ONE_CLICK.keys()), value="General", label="One-Click Mode")
        preset_company = gr.Dropdown(choices=H1B_TECH_PRESETS, label="Company Presets (H-1B Tech)", allow_custom_value=True)
        topic = gr.Textbox(label="Topic / Company", value="", placeholder="e.g., AMD, Healthcare AI, EV market India")
        days = gr.Slider(1, 30, value=7, step=1, label="Lookback (days)")
        k = gr.Slider(3, 12, value=4, step=1, label="Articles")
        fast_mode = gr.Checkbox(value=True, label="⚡ Fast mode (skip Entities & Key Phrases)")
    with gr.Row():
        entity_filter = gr.Dropdown(choices=[], label="Filter by Mentioned Company/Person", value=None)
        sentiment_filter = gr.Dropdown(choices=["ALL", "POSITIVE", "NEUTRAL", "NEGATIVE", "MIXED"], value="ALL", label="Sentiment filter")
    run_btn = gr.Button("Run Analysis", variant="primary")

    # ---------- outputs ----------
    header_bar = gr.Markdown(value="🗞️ NewsIntel — Data last updated: —")
    with gr.Tab("Insights"):
        tip_md = gr.Markdown("💡 **Tip:** *Entities* are detected names of companies/people/places (e.g., “TSMC”, “ARM”). Use the filters to focus the feed.")
        cards = gr.HTML()
    with gr.Tab("Charts"):
        plot_sent = gr.Plot(label="Sentiment distribution")
        plot_trend = gr.Plot(label="Trend (avg sentiment by day)")
    with gr.Tab("Forecast"):
        gr.Markdown("ℹ️ *The forecast projects the **average daily sentiment** trend 7 days ahead using a simple linear fit. It’s a quick momentum signal, not a trading model.*")
        plot_forecast = gr.Plot(label="7-day sentiment forecast")
    with gr.Tab("Table"):
        table = gr.Dataframe(wrap=True)
    with gr.Tab("Metrics"):
        per_article = gr.Dataframe(wrap=True, label="Per-article metrics")
        rollup = gr.Dataframe(wrap=True, label="Run summary")
    with gr.Tab("Briefing"):
        briefing_md = gr.Markdown()
        timestamp_str = gr.Textbox(label="Timestamp", interactive=False)
        export_html = gr.File(label="Download HTML")
        export_pdf = gr.File(label="Download PDF (optional)")
        s3_links = gr.HTML(value="(S3 links will appear here if configured)")
        export_btn = gr.Button("Export Briefing (creates files)")
        with gr.Row():
            weekly_email = gr.Textbox(label="Email (SES)", placeholder="name@example.com")
            email_btn = gr.Button("Email Weekly Briefing (SES)")
        email_status = gr.Markdown()

    # ---------- helpers & wiring ----------
    def _apply_mode(m, current_topic):
        # Fill the topic box with the mode's default only when it's empty.
        cfg = ONE_CLICK.get(m, ONE_CLICK["General"])
        return gr.update(value=current_topic or cfg.get("topic", ""))

    mode.change(_apply_mode, inputs=[mode, topic], outputs=[topic])
    preset_company.change(lambda x: x or "", inputs=preset_company, outputs=topic)

    # Show banner -> run analysis -> stamp timestamp box
    run_btn.click(
        start_banner,
        inputs=[k, fast_mode],
        outputs=[header_bar]
    ).then(
        analyze_news,
        inputs=[mode, preset_company, topic, days, k, entity_filter, sentiment_filter, fast_mode],
        outputs=[header_bar, cards, plot_sent, plot_trend, plot_forecast, table, per_article, rollup, briefing_md, entity_filter]
    ).then(
        # NOTE(review): this stamp uses naive local time while the header uses
        # UTC — confirm whether that divergence is intentional.
        lambda: datetime.datetime.now().strftime("%b %d, %Y %I:%M %p"),
        inputs=[],
        outputs=[timestamp_str]
    )

    export_btn.click(export_cb, inputs=[topic, briefing_md, timestamp_str], outputs=[export_html, export_pdf, s3_links])
    email_btn.click(email_weekly_cb, inputs=[topic, weekly_email, briefing_md, timestamp_str], outputs=[email_status])


if __name__ == "__main__":
    print("🚀 Launching NewsIntel (light-only UI + caching + one-click modes + forecast)")
    demo.launch()