Spaces:
Sleeping
Sleeping
| import os | |
| os.environ["OMP_NUM_THREADS"] = "1" | |
| os.environ["OPENBLAS_NUM_THREADS"] = "1" | |
| os.environ["MKL_NUM_THREADS"] = "1" | |
| os.environ["NUMEXPR_NUM_THREADS"] = "1" | |
| import json | |
| import time | |
| import gradio as gr | |
| import torch | |
| from peft import PeftModel | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | |
| from openai import OpenAI | |
| from googleapiclient.discovery import build | |
| from huggingface_hub import login | |
| # ============================================================ | |
| # CUDA / GPU Check | |
| # ============================================================ | |
| import subprocess | |
# Log GPU availability at startup so the Space logs show whether CUDA is usable.
print("Is CUDA available:", torch.cuda.is_available())
print("Torch CUDA build:", torch.version.cuda)
try:
    # nvidia-smi confirms the driver is actually reachable from this container;
    # output is truncated to keep startup logs short.
    out = subprocess.check_output(["nvidia-smi"], text=True)
    print("nvidia-smi OK:\n", out[:800])
except Exception as e:
    # Missing binary or no GPU attached: log and continue (model may run on CPU).
    print("nvidia-smi FAILED:", repr(e))
# ============================================================
# Login to HuggingFace (for private model access)
# ============================================================
# Token is optional; without it only public model repos are reachable.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# ============================================================
# Initialize API clients
# ============================================================
# OpenAI is used for claim extraction/synthesis; Google Custom Search for
# web fact-checking. All credentials come from environment variables.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
google_api_key = os.environ.get("GOOGLE_API_KEY")
google_cse_id = os.environ.get("GOOGLE_CSE_ID")

# ============================================================
# Model Configuration
# ============================================================
BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"  # base chat model
LORA_MODEL = "nidhipandya/nidhi-llama-lora"      # LoRA adapter for NewsScope
# ============================================================
# Load Model
# ============================================================
print("Loading NewsScope model...")

# 4-bit NF4 quantization with double quantization keeps the 8B model within a
# single small GPU's memory; matmuls are computed in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
tok.pad_token = tok.eos_token  # LLaMA ships without a pad token; reuse EOS
tok.padding_side = "left"      # left padding for decoder-only generation

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
)
# Attach the fine-tuned LoRA adapter on top of the quantized base weights.
base_model = PeftModel.from_pretrained(base_model, LORA_MODEL)
base_model.eval()
print("NewsScope model loaded successfully")
# ============================================================
# System Prompt (Original VERA prompt - works with your LoRA)
# ============================================================
# NOTE(review): the LoRA adapter appears to have been trained against this
# exact prompt/schema — do not change wording or section order without
# re-validating the structured output.
SYSTEM_PROMPT = """You are NewsScope, a neutral news analyst who produces structured summaries.
You will be given a single news article with:
- TITLE
- ANNOTATED timestamp
- Full article text
You MUST reply ONLY with this exact structure:
TITLE: [exactly as in article]
ANNOTATED: [exactly as given]
HEADLINE
[1 short, neutral headline sentence]
SUMMARY (Key Points)
[1-3 short PARAGRAPHS in plain text. NO bullet points. NO numbered lists.]
IN SIMPLE TERMS (ELI12)
[2-3 sentences explaining the story simply]
WHO IS INVOLVED?
[Bullet list of people/institutions from article only]
HOW THE STORY UNFOLDED
[Short chronological narrative of events from article only.]
VERIFIED CLAIMS & EVIDENCE
Central Claim: [one main claim from article]
Evidence: [facts, numbers, quotes from article only]
Why it matters: [1-3 sentences using article info only]
Confidence: High/Medium/Low
CRITICAL RULES:
- Use ONLY information in the article
- Do NOT invent specific years, dates, or numbers
- Do NOT add organizations not explicitly mentioned
- Keep neutral, analytical tone"""
# ============================================================
# Helper Functions
# ============================================================
def get_trust_level(url):
    """Classify a source URL as "HIGH", "MEDIUM", or "LOW" trust.

    Trust is decided from the URL's parsed hostname via suffix matching, so
    a trusted domain that merely appears in an untrusted URL's path or query
    string (e.g. "https://evil.com/?ref=bbc.com") no longer inflates the
    rating — the previous substring check over the whole URL had that flaw.
    Scheme-less inputs (no parseable hostname) fall back to the legacy
    whole-string substring check for backward compatibility.
    """
    from urllib.parse import urlparse

    high_trust = [".gov", "congress.gov", "senate.gov", "whitehouse.gov", ".edu",
                  "apnews.com", "reuters.com", "bbc.com", "bbc.co.uk", "cdc.gov",
                  "nih.gov", "who.int", "nasa.gov", "noaa.gov"]
    medium_trust = ["nytimes.com", "washingtonpost.com", "wsj.com", "theguardian.com",
                    "economist.com", "npr.org", "pbs.org", "nature.com", "sciencemag.org",
                    "bloomberg.com"]

    url_lower = url.lower()
    host = urlparse(url_lower).netloc.split(":")[0]  # strip any :port

    def matches(domain):
        if not host:
            # No hostname parsed (e.g. "reuters.com/story" without a scheme):
            # preserve the original whole-URL substring behavior.
            return domain in url_lower
        if domain.startswith("."):
            # TLD-style entries (".gov", ".edu") match any host under them.
            return host.endswith(domain)
        return host == domain or host.endswith("." + domain)

    if any(matches(d) for d in high_trust):
        return "HIGH"
    if any(matches(d) for d in medium_trust):
        return "MEDIUM"
    return "LOW"
def search_claim(claim, max_results=3):
    """Search Google Custom Search for a claim and keep only trusted hits.

    Returns a list of dicts (title, url, snippet, trust_level) restricted to
    HIGH/MEDIUM-trust sources; any API failure yields an empty list.
    """
    try:
        service = build("customsearch", "v1", developerKey=google_api_key)
        response = service.cse().list(q=claim, cx=google_cse_id, num=max_results).execute()
        trusted = []
        for hit in response.get("items", []):
            link = hit.get("link", "")
            level = get_trust_level(link)
            if level == "LOW":
                continue  # skip untrusted sources entirely
            trusted.append({
                "title": hit.get("title", ""),
                "url": link,
                "snippet": hit.get("snippet", ""),
                "trust_level": level,
            })
        return trusted
    except Exception as e:
        print(f"Search error: {e}")
        return []
def extract_claims(article_text, max_claims=3):
    """Ask GPT-4o-mini to extract up to *max_claims* verifiable claims.

    Returns a list of claim strings parsed from the model's JSON reply;
    any API or parsing failure yields an empty list.
    """
    prompt = f"""Extract {max_claims} specific, verifiable factual claims from this article.
Article:
{article_text}
Return ONLY a JSON array of strings. Example:
["The Senate passed legislation by 51-49", "The bill allocates $500 billion"]
Focus on specific numbers, dates, votes, concrete policy details."""
    try:
        reply = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You extract verifiable claims from news articles. Return only JSON."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
            max_tokens=200,
        )
        text = reply.choices[0].message.content.strip()
        # Strip a possible markdown code fence wrapping the JSON payload.
        if text.startswith("```json"):
            text = text.replace("```json", "").replace("```", "").strip()
        return json.loads(text)[:max_claims]
    except Exception as e:
        print(f"Extraction failed: {e}")
        return []
def fact_check_claim(claim, max_sources=3):
    """Verify one claim against web sources and return a verdict dict.

    The dict always carries: claim, status (SUPPORTED/CONTRADICTED/MIXED/
    UNCLEAR/ERROR), confidence (HIGH/MEDIUM/LOW), summary, and sources.
    """
    sources = search_claim(claim, max_results=max_sources)
    if not sources:
        # Nothing trustworthy to check against — report an UNCLEAR verdict.
        return {
            "claim": claim,
            "status": "UNCLEAR",
            "confidence": "LOW",
            "summary": "No high-trust sources found to verify this claim.",
            "sources": [],
        }

    numbered = [
        f"Source {idx} ({src['trust_level']}): {src['title']}\n{src['snippet']}"
        for idx, src in enumerate(sources, start=1)
    ]
    sources_text = "\n\n".join(numbered)
    synthesis_prompt = f"""Based on these sources, evaluate this claim:
Claim: {claim}
Sources:
{sources_text}
Respond with JSON:
{{"status": "SUPPORTED" | "CONTRADICTED" | "MIXED" | "UNCLEAR", "confidence": "HIGH" | "MEDIUM" | "LOW", "summary": "One sentence explaining what sources say"}}"""

    try:
        reply = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You verify claims against sources. Return only JSON."},
                {"role": "user", "content": synthesis_prompt},
            ],
            temperature=0.3,
            max_tokens=150,
        )
        text = reply.choices[0].message.content.strip()
        # Strip a possible markdown code fence wrapping the JSON payload.
        if text.startswith("```json"):
            text = text.replace("```json", "").replace("```", "").strip()
        verdict = json.loads(text)
        return {
            "claim": claim,
            "status": verdict.get("status", "UNCLEAR"),
            "confidence": verdict.get("confidence", "LOW"),
            "summary": verdict.get("summary", ""),
            "sources": sources,
        }
    except Exception as e:
        # Keep the sources so the caller can still show what was consulted.
        return {
            "claim": claim,
            "status": "ERROR",
            "confidence": "LOW",
            "summary": f"Error during synthesis: {str(e)}",
            "sources": sources,
        }
def fact_check_article(article_text, max_claims=3, sources_per_claim=3):
    """Extract claims from the article and fact-check each one in turn."""
    verdicts = []
    for claim in extract_claims(article_text, max_claims=max_claims):
        verdicts.append(fact_check_claim(claim, max_sources=sources_per_claim))
        time.sleep(1)  # throttle to stay under the search API rate limit
    return verdicts
def generate_implications(fact_results):
    """Generate implications based on fact-check results"""
    if not fact_results:
        return ""
    claims_summary = "\n".join(
        f"- {entry['claim'][:100]}: {entry['status']}" for entry in fact_results
    )
    prompt = f"""Based on these fact-checked claims and their verdicts, write exactly 3 bullet points about implications.
Each bullet must be exactly 1 sentence.
Claims:
{claims_summary}
Write 3 implications covering:
1. Public perception impact
2. Policy substance
3. Information quality
Format as:
- Public perception: [one sentence]
- Policy substance: [one sentence]
- Information quality: [one sentence]"""
    try:
        reply = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You analyze implications of fact-checked news claims. Be concise."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.5,
            max_tokens=200,
        )
        return reply.choices[0].message.content.strip()
    except Exception as e:
        # Implications are best-effort: fall back to an empty section.
        print(f"Implications generation failed: {e}")
        return ""
# ============================================================
# Generate NewsScope Summary
# ============================================================
def generate_newsscope_summary(article_text):
    """Run the LoRA-tuned LLaMA model on the article and return its summary.

    The article's first two lines are kept as TITLE/ANNOTATED fallbacks in
    case the model's reply does not start with the expected header.
    """
    header = article_text.splitlines()
    title_line = header[0].strip() if header else "TITLE: Unknown"
    annotated_line = header[1].strip() if len(header) > 1 else "ANNOTATED: Unknown"

    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": article_text},
    ]
    prompt = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    inputs = tok(prompt, return_tensors="pt").to(base_model.device)

    with torch.no_grad():
        outputs = base_model.generate(
            **inputs,
            max_new_tokens=700,
            do_sample=True,
            temperature=0.3,
            top_p=0.9,
            pad_token_id=tok.pad_token_id,
            eos_token_id=tok.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    generated_ids = outputs[0][inputs["input_ids"].shape[-1]:]
    raw = tok.decode(generated_ids, skip_special_tokens=True).strip()

    # Drop leaked chat-template / system artifacts from the model output.
    kept = [
        line
        for line in raw.splitlines()
        if "system=" not in line.lower()
        and "Identifier Separator" not in line
        and "Manual separation" not in line
    ]
    raw = "\n".join(kept).strip()

    # Guarantee the TITLE/ANNOTATED header the downstream formatter expects.
    if not raw.startswith("TITLE:"):
        raw = f"{title_line}\n{annotated_line}\n\n" + raw
    return raw
| # ============================================================ | |
| # Enrich Output with Fact-Checks (Production Version) | |
| # ============================================================ | |
| def enrich_newsscope_output(newsscope_summary, fact_check_results, domain="Politics"): | |
| from urllib.parse import urlparse | |
| if not fact_check_results: | |
| return newsscope_summary | |
| # Build the enriched fact-check section | |
| lines = [] | |
| lines.append("") | |
| lines.append("=" * 70) | |
| lines.append("VERIFIED CLAIMS & EVIDENCE (WITH WEB FACT-CHECK)") | |
| lines.append("=" * 70) | |
| lines.append("") | |
| # Quick verdicts | |
| lines.append("Quick verdicts") | |
| for result in fact_check_results[:3]: | |
| claim_text = result.get("claim", "").strip() | |
| words = claim_text.split() | |
| claim_short = " ".join(words[:6]) | |
| if len(words) > 6: | |
| claim_short += "..." | |
| status = result.get("status", "UNCLEAR").upper() | |
| if status == "SUPPORTED": | |
| icon = "[OK]" | |
| elif status == "CONTRADICTED": | |
| icon = "[X]" | |
| else: | |
| icon = "[?]" | |
| lines.append(f"- {claim_short}: {icon} {status.capitalize()}") | |
| lines.append("") | |
| lines.append("-" * 70) | |
| lines.append("") | |
| # Detailed claims with sources | |
| for idx, result in enumerate(fact_check_results[:3], 1): | |
| claim_text = result.get("claim", "").strip() | |
| if len(claim_text) > 200: | |
| claim_text = claim_text[:197] + "..." | |
| status = result.get("status", "UNCLEAR").upper() | |
| confidence = result.get("confidence", "LOW").upper() | |
| summary = result.get("summary", "").strip().replace("\n", " ") | |
| if status == "SUPPORTED": | |
| icon = "[OK]" | |
| elif status == "CONTRADICTED": | |
| icon = "[X]" | |
| else: | |
| icon = "[?]" | |
| lines.append(f"CLAIM {idx}") | |
| lines.append(claim_text) | |
| lines.append("") | |
| lines.append("Evidence (from article)") | |
| lines.append("As stated in the article above.") | |
| lines.append("") | |
| lines.append("Fact check (web)") | |
| lines.append(f"{icon} {status} -- Confidence: {confidence}. {summary}") | |
| lines.append("") | |
| # Add source with clickable link format | |
| sources = result.get("sources", []) | |
| if sources: | |
| trust_order = {"HIGH": 3, "MEDIUM": 2, "LOW": 1} | |
| best_source = max(sources, key=lambda s: trust_order.get(s.get("trust_level", "LOW"), 0)) | |
| title = best_source.get("title", "").strip() | |
| url = best_source.get("url", "").strip() | |
| # Extract domain for display | |
| try: | |
| domain_name = urlparse(url).netloc.replace("www.", "") | |
| except: | |
| domain_name = "source" | |
| lines.append("Source") | |
| lines.append(f"- [{title} ({domain_name})]({url})") | |
| else: | |
| lines.append("Source") | |
| lines.append("- No high-trust sources found for this claim.") | |
| lines.append("") | |
| lines.append("-" * 70) | |
| lines.append("") | |
| # Generate and add implications | |
| implications = generate_implications(fact_check_results) | |
| if implications: | |
| lines.append("") | |
| lines.append("=" * 70) | |
| lines.append("IMPLICATIONS (BASED ON ARTICLE + WEB SOURCES)") | |
| lines.append("=" * 70) | |
| lines.append(implications) | |
| lines.append("") | |
| new_section_text = "\n".join(lines) | |
| # Replace old VERIFIED section or append | |
| header = "VERIFIED CLAIMS & EVIDENCE" | |
| if header in newsscope_summary: | |
| before, _, _ = newsscope_summary.partition(header) | |
| enriched = before.rstrip() + "\n" + new_section_text | |
| else: | |
| enriched = newsscope_summary.rstrip() + "\n" + new_section_text | |
| return enriched | |
# ============================================================
# Main NewsScope API Function
# ============================================================
def newsscope_api(domain, article_text):
    """Full pipeline: summarize, fact-check, and enrich a pasted article.

    Returns the formatted analysis text, or an "ERROR: ..." string when the
    input is too short or any stage of the pipeline fails.
    """
    # Guard clause: reject empty or trivially short input up front.
    if not article_text or len(article_text.strip()) < 50:
        return "ERROR: Please enter an article with at least 50 characters."
    try:
        # Step 1: structured summary from the fine-tuned model.
        print("Generating NewsScope summary...")
        summary = generate_newsscope_summary(article_text)

        # Step 2: extract and verify claims against the web.
        print("Fact-checking claims with web search...")
        verdicts = fact_check_article(article_text, max_claims=3, sources_per_claim=3)

        # Step 3: merge the verdicts into the summary.
        print("Enriching output with fact-checks...")
        enriched = enrich_newsscope_output(summary, verdicts, domain)

        return f"""
======================================================================
NEWSSCOPE ANALYSIS -- Domain: {domain}
======================================================================
{enriched}
======================================================================
ANALYSIS COMPLETE
======================================================================
"""
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error: {error_details}")
        return f"ERROR: {str(e)}\n\nDetails: {error_details}"
# ============================================================
# Sample Articles
# ============================================================
# Demo articles, one per supported domain. Each follows the expected input
# format: TITLE on line 1, ANNOTATED date on line 2, then body paragraphs.
# (The Science/Environment title previously contained corrupted repeated
# "high High" tokens; restored to a clean headline.)
SAMPLE_ARTICLES = {
    "Politics": """TITLE: Senate Passes Historic Climate Legislation in Narrow Vote
ANNOTATED: 2025-12-03
In a dramatic late-night session, the U.S. Senate passed landmark climate legislation by a razor-thin margin of 51-49, marking a significant shift in the nation's environmental policy. The bill, which allocates $500 billion over ten years for renewable energy infrastructure, sets binding emissions reduction targets of 50% by 2035.
The legislation faced fierce opposition from Republican senators who argued it would devastate the fossil fuel industry and lead to massive job losses. Senator John Smith (R-TX) called it "an economic disaster waiting to happen," while environmental groups hailed it as a "historic breakthrough."
Democrats, who hold a slim majority, managed to keep their caucus united despite concerns from moderate members about the bill's impact on energy prices. Vice President Harris was present in the chamber in case a tie-breaking vote was needed.
The bill now heads to the House of Representatives, where Speaker Johnson has indicated it will face a contentious debate. Industry leaders have expressed mixed reactions, with renewable energy companies welcoming the investment while traditional energy producers warn of a rushed transition.""",
    "Health": """TITLE: FDA Approves Revolutionary Gene Therapy for Rare Childhood Disease
ANNOTATED: 2025-11-15
The Food and Drug Administration announced today the approval of a groundbreaking gene therapy treatment for spinal muscular atrophy (SMA), a rare genetic disorder affecting approximately 1 in 10,000 newborns. The therapy, developed by Novartis subsidiary AveXis, represents only the third gene therapy ever approved in the United States.
Clinical trials showed remarkable results, with 90% of treated infants achieving motor milestones they would never have reached without treatment. The therapy works by delivering a functional copy of the SMN1 gene, which is defective in SMA patients.
However, the treatment comes with a staggering price tag of $2.1 million per patient, making it one of the most expensive drugs ever approved. Insurance companies and patient advocacy groups have raised concerns about accessibility.
Dr. Sarah Chen, lead researcher at Children's Hospital Boston, called the approval "a watershed moment for genetic medicine," while cautioning that long-term follow-up studies are still ongoing.""",
    "Science/Environment": """TITLE: Scientists Discover Record-Breaking CO2 Absorption in Amazon Rainforest
ANNOTATED: 2025-10-22
A team of international researchers has documented unprecedented levels of carbon dioxide absorption in previously unstudied regions of the Amazon rainforest, according to a study published today in Nature Climate Change.
Using satellite imagery and ground-based sensors, the team found that certain areas of old-growth forest are absorbing up to 40% more CO2 than previous estimates suggested. The findings could significantly alter climate models and carbon budget calculations.
Lead author Dr. Maria Santos from the University of Sao Paulo said the discovery "challenges our understanding of the Amazon's role as a carbon sink." The research involved scientists from 12 countries and took five years to complete.
However, the researchers warned that deforestation rates in Brazil have increased 23% in the past year, threatening these crucial carbon-absorbing regions. Environmental groups are calling for immediate action to protect the newly identified high-absorption zones.""",
    "Business": """TITLE: Tech Giant Announces Largest Layoff in Company History
ANNOTATED: 2025-09-30
Silicon Valley was rocked today as MegaTech Corporation announced plans to eliminate 15,000 jobs, representing 12% of its global workforce. The layoffs, which will primarily affect the company's cloud computing and advertising divisions, are the largest in the company's 25-year history.
CEO Michael Roberts attributed the decision to "challenging macroeconomic conditions" and the need to "realign resources toward artificial intelligence initiatives." The company's stock initially dropped 8% on the news before recovering to close down 3%.
The announcement comes just weeks after MegaTech reported quarterly profits of $12.4 billion, leading critics to question the necessity of the cuts. Labor advocates called the move "corporate greed at its worst."
Affected employees will receive severance packages equal to 16 weeks of salary plus an additional two weeks for each year of service. The company expects to save $2.5 billion annually from the restructuring."""
}
def load_sample(domain):
    """Return the sample article for *domain*; fall back to Politics."""
    if domain in SAMPLE_ARTICLES:
        return SAMPLE_ARTICLES[domain]
    return SAMPLE_ARTICLES["Politics"]
# ============================================================
# Gradio Interface
# ============================================================
with gr.Blocks(title="NewsScope: Cross-Domain News Claim Extraction") as demo:
    # Header and usage instructions.
    gr.Markdown("""
# NewsScope: Cross-Domain News Claim Extraction
**Schema-grounded claim extraction and fact-checking across multiple news domains.**
**Note:** If the model is warming up after inactivity, please wait 30-60 seconds for the first response.
---
**How to use:**
1. Select a news domain (Politics, Health, Science/Environment, or Business)
2. Paste your article text OR click "Load Sample" to try a demo
3. Click "Analyze" to get structured extraction + web fact-checking
**Paper:** [arXiv:2601.08852](https://arxiv.org/abs/2601.08852) | **Code:** [GitHub](https://github.com/nidhip1611/NewsScope)
""")
    with gr.Row():
        with gr.Column(scale=1):
            # Domain selector feeds both the sample loader and the analysis.
            domain_dropdown = gr.Dropdown(
                choices=["Politics", "Health", "Science/Environment", "Business"],
                value="Politics",
                label="Select News Domain",
                info="Choose the domain that best matches your article"
            )
            load_sample_btn = gr.Button("Load Sample Article", variant="secondary")
        with gr.Column(scale=2):
            article_input = gr.Textbox(
                lines=15,
                label="Article Text",
                placeholder="TITLE: Your Article Title\nANNOTATED: 2025-01-20\n\n[Paste your full article text here...]\n\nYou can paste any news article - the system will analyze it and fact-check claims against trusted web sources.",
                info="Format: TITLE on line 1, ANNOTATED date on line 2, then article text"
            )
    analyze_btn = gr.Button("Analyze Article", variant="primary", size="lg")
    output_box = gr.Textbox(
        lines=35,
        label="NewsScope Analysis"
    )
    # Footer: headline evaluation numbers and licensing info.
    gr.Markdown("""
---
**About NewsScope:**
- **89.4% accuracy** on human-evaluated claims (400 claims across 4 domains)
- **98.8% schema validity** -- structured output every time
- **Cross-domain:** Works on Politics, Health, Science/Environment, and Business news
- **Web fact-checking:** Claims verified against trusted sources (.gov, .edu, Reuters, AP, etc.)
Built with LLaMA 3.1 8B + LoRA fine-tuning. Licensed under MIT (code) and CC-BY-4.0 (dataset annotations).
""")
    # Wire events: sample loading and the main analysis pipeline.
    load_sample_btn.click(
        fn=load_sample,
        inputs=[domain_dropdown],
        outputs=[article_input]
    )
    analyze_btn.click(
        fn=newsscope_api,
        inputs=[domain_dropdown, article_input],
        outputs=[output_box]
    )

# Bounded queue so concurrent requests don't pile up on a single GPU worker.
demo.queue(max_size=10).launch()