# NewsScope -- Hugging Face Space entry point (app.py)
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
import json
import time
import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from openai import OpenAI
from googleapiclient.discovery import build
from huggingface_hub import login
# ============================================================
# CUDA / GPU Check
# ============================================================
# Startup diagnostics: log GPU visibility so misconfigured Space hardware
# is obvious in the container logs.
import subprocess

print("Is CUDA available:", torch.cuda.is_available())
print("Torch CUDA build:", torch.version.cuda)
try:
    # nvidia-smi may be absent on CPU-only hardware; failure is non-fatal.
    out = subprocess.check_output(["nvidia-smi"], text=True)
    print("nvidia-smi OK:\n", out[:800])
except Exception as e:
    print("nvidia-smi FAILED:", repr(e))
# ============================================================
# Login to HuggingFace (for private model access)
# ============================================================
# HF_TOKEN must be provided as a Space secret when the LoRA repo is private.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# ============================================================
# Initialize API clients
# ============================================================
# OpenAI is used for claim extraction/verdict synthesis; Google Custom
# Search provides the web evidence the verdicts are based on.
openai_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
google_api_key = os.environ.get("GOOGLE_API_KEY")
google_cse_id = os.environ.get("GOOGLE_CSE_ID")

# ============================================================
# Model Configuration
# ============================================================
BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"  # frozen base model
LORA_MODEL = "nidhipandya/nidhi-llama-lora"      # fine-tuned LoRA adapter
# ============================================================
# Load Model
# ============================================================
print("Loading NewsScope model...")

# 4-bit NF4 quantization (with double quantization) keeps the 8B model
# within a single small GPU's memory; compute runs in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Llama ships without a pad token; reuse EOS, and left-pad because this is
# a decoder-only model generating from the right edge of the prompt.
tok.pad_token = tok.eos_token
tok.padding_side = "left"

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
)
# Attach the LoRA adapter on top of the quantized base weights.
base_model = PeftModel.from_pretrained(base_model, LORA_MODEL)
base_model.eval()
print("NewsScope model loaded successfully")
# ============================================================
# System Prompt (Original VERA prompt - works with your LoRA)
# ============================================================
# NOTE: the exact wording and section headers below match what the LoRA
# adapter was fine-tuned on; editing them degrades schema compliance.
SYSTEM_PROMPT = """You are NewsScope, a neutral news analyst who produces structured summaries.
You will be given a single news article with:
- TITLE
- ANNOTATED timestamp
- Full article text
You MUST reply ONLY with this exact structure:
TITLE: [exactly as in article]
ANNOTATED: [exactly as given]
HEADLINE
[1 short, neutral headline sentence]
SUMMARY (Key Points)
[1-3 short PARAGRAPHS in plain text. NO bullet points. NO numbered lists.]
IN SIMPLE TERMS (ELI12)
[2-3 sentences explaining the story simply]
WHO IS INVOLVED?
[Bullet list of people/institutions from article only]
HOW THE STORY UNFOLDED
[Short chronological narrative of events from article only.]
VERIFIED CLAIMS & EVIDENCE
Central Claim: [one main claim from article]
Evidence: [facts, numbers, quotes from article only]
Why it matters: [1-3 sentences using article info only]
Confidence: High/Medium/Low
CRITICAL RULES:
- Use ONLY information in the article
- Do NOT invent specific years, dates, or numbers
- Do NOT add organizations not explicitly mentioned
- Keep neutral, analytical tone"""
# ============================================================
# Helper Functions
# ============================================================
def get_trust_level(url):
    """Classify *url* as "HIGH", "MEDIUM", or "LOW" trust by its host name.

    The original implementation did a plain substring search over the whole
    URL, so any page could spoof a trusted rating by embedding a trusted
    domain in its path or query (e.g. "https://evil.com/apnews.com").
    This version matches only the parsed network location: patterns that
    start with "." (".gov", ".edu") match any host ending in that suffix;
    plain domains match the domain itself or any subdomain of it.
    """
    from urllib.parse import urlparse

    high_trust = [".gov", "congress.gov", "senate.gov", "whitehouse.gov", ".edu",
                  "apnews.com", "reuters.com", "bbc.com", "bbc.co.uk", "cdc.gov",
                  "nih.gov", "who.int", "nasa.gov", "noaa.gov"]
    medium_trust = ["nytimes.com", "washingtonpost.com", "wsj.com", "theguardian.com",
                    "economist.com", "npr.org", "pbs.org", "nature.com", "sciencemag.org",
                    "bloomberg.com"]

    host = urlparse(url).netloc
    if not host:
        # Scheme-less input ("apnews.com/story"): reparse as a network path.
        host = urlparse("//" + url).netloc
    host = host.lower().split(":")[0]  # drop any explicit port

    def _matches(pattern):
        # ".gov"-style TLD suffixes vs exact-domain-or-subdomain patterns.
        if pattern.startswith("."):
            return host.endswith(pattern) or host == pattern[1:]
        return host == pattern or host.endswith("." + pattern)

    if any(_matches(d) for d in high_trust):
        return "HIGH"
    if any(_matches(d) for d in medium_trust):
        return "MEDIUM"
    return "LOW"
def search_claim(claim, max_results=3):
    """Query Google Custom Search for *claim*, keeping only trusted hits.

    Returns a list of {"title", "url", "snippet", "trust_level"} dicts for
    results rated HIGH or MEDIUM by get_trust_level(); any API failure is
    logged and yields an empty list so the pipeline degrades gracefully.
    """
    try:
        service = build("customsearch", "v1", developerKey=google_api_key)
        response = service.cse().list(q=claim, cx=google_cse_id, num=max_results).execute()

        trusted = []
        for hit in response.get("items", []):
            link = hit.get("link", "")
            level = get_trust_level(link)
            if level not in ("HIGH", "MEDIUM"):
                continue  # discard low-trust sources outright
            trusted.append({
                "title": hit.get("title", ""),
                "url": link,
                "snippet": hit.get("snippet", ""),
                "trust_level": level,
            })
        return trusted
    except Exception as e:
        print(f"Search error: {e}")
        return []
def extract_claims(article_text, max_claims=3):
    """Use GPT-4o-mini to pull up to *max_claims* checkable claims from the article.

    Returns a list of claim strings; on any failure (API error, malformed
    JSON) an empty list is returned so downstream steps simply skip
    fact-checking.
    """
    prompt = f"""Extract {max_claims} specific, verifiable factual claims from this article.
Article:
{article_text}
Return ONLY a JSON array of strings. Example:
["The Senate passed legislation by 51-49", "The bill allocates $500 billion"]
Focus on specific numbers, dates, votes, concrete policy details."""
    try:
        completion = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You extract verifiable claims from news articles. Return only JSON."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.3,
            max_tokens=200,
        )
        text = completion.choices[0].message.content.strip()
        # The model sometimes wraps its answer in a markdown code fence.
        if text.startswith("```json"):
            text = text.replace("```json", "").replace("```", "").strip()
        parsed = json.loads(text)
        return parsed[:max_claims]
    except Exception as e:
        print(f"Extraction failed: {e}")
        return []
def fact_check_claim(claim, max_sources=3):
    """Verify a single claim against web sources and return a verdict dict.

    The result always has keys: claim, status, confidence, summary, sources.
    status is one of SUPPORTED / CONTRADICTED / MIXED / UNCLEAR / ERROR.
    """
    sources = search_claim(claim, max_results=max_sources)

    # No trusted evidence at all -- report UNCLEAR instead of guessing.
    if not sources:
        return {
            "claim": claim,
            "status": "UNCLEAR",
            "confidence": "LOW",
            "summary": "No high-trust sources found to verify this claim.",
            "sources": [],
        }

    numbered = []
    for idx, src in enumerate(sources, 1):
        numbered.append(f"Source {idx} ({src['trust_level']}): {src['title']}\n{src['snippet']}")
    sources_text = "\n\n".join(numbered)

    synthesis_prompt = f"""Based on these sources, evaluate this claim:
Claim: {claim}
Sources:
{sources_text}
Respond with JSON:
{{"status": "SUPPORTED" | "CONTRADICTED" | "MIXED" | "UNCLEAR", "confidence": "HIGH" | "MEDIUM" | "LOW", "summary": "One sentence explaining what sources say"}}"""

    try:
        completion = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You verify claims against sources. Return only JSON."},
                {"role": "user", "content": synthesis_prompt},
            ],
            temperature=0.3,
            max_tokens=150,
        )
        text = completion.choices[0].message.content.strip()
        # Strip a markdown code fence if the model added one.
        if text.startswith("```json"):
            text = text.replace("```json", "").replace("```", "").strip()
        verdict = json.loads(text)
        return {
            "claim": claim,
            "status": verdict.get("status", "UNCLEAR"),
            "confidence": verdict.get("confidence", "LOW"),
            "summary": verdict.get("summary", ""),
            "sources": sources,
        }
    except Exception as e:
        # Keep the gathered sources even when synthesis fails.
        return {
            "claim": claim,
            "status": "ERROR",
            "confidence": "LOW",
            "summary": f"Error during synthesis: {str(e)}",
            "sources": sources,
        }
def fact_check_article(article_text, max_claims=3, sources_per_claim=3):
    """Extract claims from the article and fact-check each one in turn.

    Sleeps one second between claims to stay under search-API rate limits.
    Returns a (possibly empty) list of verdict dicts from fact_check_claim.
    """
    verdicts = []
    for claim in extract_claims(article_text, max_claims=max_claims):
        verdicts.append(fact_check_claim(claim, max_sources=sources_per_claim))
        time.sleep(1)
    return verdicts
def generate_implications(fact_results):
    """Summarize the implications of the fact-check verdicts as 3 bullets.

    Returns an empty string when there are no results or the API call fails.
    """
    if not fact_results:
        return ""

    bullet_lines = []
    for verdict in fact_results:
        bullet_lines.append(f"- {verdict['claim'][:100]}: {verdict['status']}")
    claims_summary = "\n".join(bullet_lines)

    prompt = f"""Based on these fact-checked claims and their verdicts, write exactly 3 bullet points about implications.
Each bullet must be exactly 1 sentence.
Claims:
{claims_summary}
Write 3 implications covering:
1. Public perception impact
2. Policy substance
3. Information quality
Format as:
- Public perception: [one sentence]
- Policy substance: [one sentence]
- Information quality: [one sentence]"""

    try:
        completion = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You analyze implications of fact-checked news claims. Be concise."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.5,
            max_tokens=200,
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"Implications generation failed: {e}")
        return ""
# ============================================================
# Generate NewsScope Summary
# ============================================================
def generate_newsscope_summary(article_text):
    """Run the LoRA-tuned Llama model over *article_text* and return clean text.

    Prepends the article's TITLE/ANNOTATED lines when the model omits them,
    and strips known generation artifacts from the decoded output.
    """
    header_lines = article_text.splitlines()
    title_line = header_lines[0].strip() if header_lines else "TITLE: Unknown"
    annotated_line = header_lines[1].strip() if len(header_lines) > 1 else "ANNOTATED: Unknown"

    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": article_text},
    ]
    prompt = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    encoded = tok(prompt, return_tensors="pt").to(base_model.device)

    with torch.no_grad():
        output_ids = base_model.generate(
            **encoded,
            max_new_tokens=700,
            do_sample=True,
            temperature=0.3,
            top_p=0.9,
            pad_token_id=tok.pad_token_id,
            eos_token_id=tok.eos_token_id,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = output_ids[0][encoded["input_ids"].shape[-1]:]
    text = tok.decode(new_tokens, skip_special_tokens=True).strip()

    # Drop lines matching known system/template artifacts.
    kept = [
        ln for ln in text.splitlines()
        if "system=" not in ln.lower()
        and "Identifier Separator" not in ln
        and "Manual separation" not in ln
    ]
    text = "\n".join(kept).strip()

    if not text.startswith("TITLE:"):
        text = f"{title_line}\n{annotated_line}\n\n" + text
    return text
# ============================================================
# Enrich Output with Fact-Checks (Production Version)
# ============================================================
def enrich_newsscope_output(newsscope_summary, fact_check_results, domain="Politics"):
from urllib.parse import urlparse
if not fact_check_results:
return newsscope_summary
# Build the enriched fact-check section
lines = []
lines.append("")
lines.append("=" * 70)
lines.append("VERIFIED CLAIMS & EVIDENCE (WITH WEB FACT-CHECK)")
lines.append("=" * 70)
lines.append("")
# Quick verdicts
lines.append("Quick verdicts")
for result in fact_check_results[:3]:
claim_text = result.get("claim", "").strip()
words = claim_text.split()
claim_short = " ".join(words[:6])
if len(words) > 6:
claim_short += "..."
status = result.get("status", "UNCLEAR").upper()
if status == "SUPPORTED":
icon = "[OK]"
elif status == "CONTRADICTED":
icon = "[X]"
else:
icon = "[?]"
lines.append(f"- {claim_short}: {icon} {status.capitalize()}")
lines.append("")
lines.append("-" * 70)
lines.append("")
# Detailed claims with sources
for idx, result in enumerate(fact_check_results[:3], 1):
claim_text = result.get("claim", "").strip()
if len(claim_text) > 200:
claim_text = claim_text[:197] + "..."
status = result.get("status", "UNCLEAR").upper()
confidence = result.get("confidence", "LOW").upper()
summary = result.get("summary", "").strip().replace("\n", " ")
if status == "SUPPORTED":
icon = "[OK]"
elif status == "CONTRADICTED":
icon = "[X]"
else:
icon = "[?]"
lines.append(f"CLAIM {idx}")
lines.append(claim_text)
lines.append("")
lines.append("Evidence (from article)")
lines.append("As stated in the article above.")
lines.append("")
lines.append("Fact check (web)")
lines.append(f"{icon} {status} -- Confidence: {confidence}. {summary}")
lines.append("")
# Add source with clickable link format
sources = result.get("sources", [])
if sources:
trust_order = {"HIGH": 3, "MEDIUM": 2, "LOW": 1}
best_source = max(sources, key=lambda s: trust_order.get(s.get("trust_level", "LOW"), 0))
title = best_source.get("title", "").strip()
url = best_source.get("url", "").strip()
# Extract domain for display
try:
domain_name = urlparse(url).netloc.replace("www.", "")
except:
domain_name = "source"
lines.append("Source")
lines.append(f"- [{title} ({domain_name})]({url})")
else:
lines.append("Source")
lines.append("- No high-trust sources found for this claim.")
lines.append("")
lines.append("-" * 70)
lines.append("")
# Generate and add implications
implications = generate_implications(fact_check_results)
if implications:
lines.append("")
lines.append("=" * 70)
lines.append("IMPLICATIONS (BASED ON ARTICLE + WEB SOURCES)")
lines.append("=" * 70)
lines.append(implications)
lines.append("")
new_section_text = "\n".join(lines)
# Replace old VERIFIED section or append
header = "VERIFIED CLAIMS & EVIDENCE"
if header in newsscope_summary:
before, _, _ = newsscope_summary.partition(header)
enriched = before.rstrip() + "\n" + new_section_text
else:
enriched = newsscope_summary.rstrip() + "\n" + new_section_text
return enriched
# ============================================================
# Main NewsScope API Function
# ============================================================
def newsscope_api(domain, article_text):
    """End-to-end pipeline: summarize, fact-check, enrich, and format.

    Returns the formatted analysis report, or an "ERROR: ..." string when
    the input is too short or any pipeline stage raises.
    """
    if not article_text or len(article_text.strip()) < 50:
        return "ERROR: Please enter an article with at least 50 characters."
    try:
        # Step 1: model summary.
        print("Generating NewsScope summary...")
        summary = generate_newsscope_summary(article_text)

        # Step 2: web fact-check.
        print("Fact-checking claims with web search...")
        verdicts = fact_check_article(article_text, max_claims=3, sources_per_claim=3)

        # Step 3: merge fact-checks into the summary.
        print("Enriching output with fact-checks...")
        enriched = enrich_newsscope_output(summary, verdicts, domain)

        return f"""
======================================================================
NEWSSCOPE ANALYSIS -- Domain: {domain}
======================================================================
{enriched}
======================================================================
ANALYSIS COMPLETE
======================================================================
"""
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"Error: {error_details}")
        return f"ERROR: {str(e)}\n\nDetails: {error_details}"
# ============================================================
# Sample Articles
# ============================================================
# Demo articles, one per supported domain, already in the expected input
# format (TITLE on line 1, ANNOTATED date on line 2, then body text).
SAMPLE_ARTICLES = {
    "Politics": """TITLE: Senate Passes Historic Climate Legislation in Narrow Vote
ANNOTATED: 2025-12-03
In a dramatic late-night session, the U.S. Senate passed landmark climate legislation by a razor-thin margin of 51-49, marking a significant shift in the nation's environmental policy. The bill, which allocates $500 billion over ten years for renewable energy infrastructure, sets binding emissions reduction targets of 50% by 2035.
The legislation faced fierce opposition from Republican senators who argued it would devastate the fossil fuel industry and lead to massive job losses. Senator John Smith (R-TX) called it "an economic disaster waiting to happen," while environmental groups hailed it as a "historic breakthrough."
Democrats, who hold a slim majority, managed to keep their caucus united despite concerns from moderate members about the bill's impact on energy prices. Vice President Harris was present in the chamber in case a tie-breaking vote was needed.
The bill now heads to the House of Representatives, where Speaker Johnson has indicated it will face a contentious debate. Industry leaders have expressed mixed reactions, with renewable energy companies welcoming the investment while traditional energy producers warn of a rushed transition.""",
    "Health": """TITLE: FDA Approves Revolutionary Gene Therapy for Rare Childhood Disease
ANNOTATED: 2025-11-15
The Food and Drug Administration announced today the approval of a groundbreaking gene therapy treatment for spinal muscular atrophy (SMA), a rare genetic disorder affecting approximately 1 in 10,000 newborns. The therapy, developed by Novartis subsidiary AveXis, represents only the third gene therapy ever approved in the United States.
Clinical trials showed remarkable results, with 90% of treated infants achieving motor milestones they would never have reached without treatment. The therapy works by delivering a functional copy of the SMN1 gene, which is defective in SMA patients.
However, the treatment comes with a staggering price tag of $2.1 million per patient, making it one of the most expensive drugs ever approved. Insurance companies and patient advocacy groups have raised concerns about accessibility.
Dr. Sarah Chen, lead researcher at Children's Hospital Boston, called the approval "a watershed moment for genetic medicine," while cautioning that long-term follow-up studies are still ongoing.""",
    "Science/Environment": """TITLE: Scientists Discover high high high high high High Record-Breaking high High High High high high high high High High High High High High high high high CO2 Absorption in Amazon Rainforest
ANNOTATED: 2025-10-22
A team of international researchers has documented unprecedented levels of carbon dioxide absorption in previously unstudied regions of the Amazon rainforest, according to a study published today in Nature Climate Change.
Using satellite imagery and ground-based sensors, the team found that certain areas of old-growth forest are absorbing up to 40% more CO2 than previous estimates suggested. The findings could significantly alter climate models and carbon budget calculations.
Lead author Dr. Maria Santos from the University of Sao Paulo said the discovery "challenges our understanding of the Amazon's role as a carbon sink." The research involved scientists from 12 countries and took five years to complete.
However, the researchers warned that deforestation rates in Brazil have increased 23% in the past year, threatening these crucial carbon-absorbing regions. Environmental groups are calling for immediate action to protect the newly identified high-absorption zones.""",
    "Business": """TITLE: Tech Giant Announces Largest Layoff in Company History
ANNOTATED: 2025-09-30
Silicon Valley was rocked today as MegaTech Corporation announced plans to eliminate 15,000 jobs, representing 12% of its global workforce. The layoffs, which will primarily affect the company's cloud computing and advertising divisions, are the largest in the company's 25-year history.
CEO Michael Roberts attributed the decision to "challenging macroeconomic conditions" and the need to "realign resources toward artificial intelligence initiatives." The company's stock initially dropped 8% on the news before recovering to close down 3%.
The announcement comes just weeks after MegaTech reported quarterly profits of $12.4 billion, leading critics to question the necessity of the cuts. Labor advocates called the move "corporate greed at its worst."
Affected employees will receive severance packages equal to 16 weeks of salary plus an additional two weeks for each year of service. The company expects to save $2.5 billion annually from the restructuring."""
}
def load_sample(domain):
    """Return the demo article for *domain*, falling back to Politics."""
    article = SAMPLE_ARTICLES.get(domain)
    if article is None:
        article = SAMPLE_ARTICLES["Politics"]
    return article
# ============================================================
# Gradio Interface
# ============================================================
with gr.Blocks(title="NewsScope: Cross-Domain News Claim Extraction") as demo:
    # Intro / usage instructions shown above the controls.
    gr.Markdown("""
# NewsScope: Cross-Domain News Claim Extraction
**Schema-grounded claim extraction and fact-checking across multiple news domains.**
**Note:** If the model is warming up after inactivity, please wait 30-60 seconds for the first response.
---
**How to use:**
1. Select a news domain (Politics, Health, Science/Environment, or Business)
2. Paste your article text OR click "Load Sample" to try a demo
3. Click "Analyze" to get structured extraction + web fact-checking
**Paper:** [arXiv:2601.08852](https://arxiv.org/abs/2601.08852) | **Code:** [GitHub](https://github.com/nidhip1611/NewsScope)
""")
    with gr.Row():
        with gr.Column(scale=1):
            # Domain selector; also drives which sample article is loaded.
            domain_dropdown = gr.Dropdown(
                choices=["Politics", "Health", "Science/Environment", "Business"],
                value="Politics",
                label="Select News Domain",
                info="Choose the domain that best matches your article"
            )
            load_sample_btn = gr.Button("Load Sample Article", variant="secondary")
        with gr.Column(scale=2):
            # Free-form article input in the TITLE/ANNOTATED format.
            article_input = gr.Textbox(
                lines=15,
                label="Article Text",
                placeholder="TITLE: Your Article Title\nANNOTATED: 2025-01-20\n\n[Paste your full article text here...]\n\nYou can paste any news article - the system will analyze it and fact-check claims against trusted web sources.",
                info="Format: TITLE on line 1, ANNOTATED date on line 2, then article text"
            )
    analyze_btn = gr.Button("Analyze Article", variant="primary", size="lg")
    # Plain-text report produced by newsscope_api.
    output_box = gr.Textbox(
        lines=35,
        label="NewsScope Analysis"
    )
    # Footer with project stats and licensing.
    gr.Markdown("""
---
**About NewsScope:**
- **89.4% accuracy** on human-evaluated claims (400 claims across 4 domains)
- **98.8% schema validity** -- structured output every time
- **Cross-domain:** Works on Politics, Health, Science/Environment, and Business news
- **Web fact-checking:** Claims verified against trusted sources (.gov, .edu, Reuters, AP, etc.)
Built with LLaMA 3.1 8B + LoRA fine-tuning. Licensed under MIT (code) and CC-BY-4.0 (dataset annotations).
""")
    # Wire the buttons to their handlers.
    load_sample_btn.click(
        fn=load_sample,
        inputs=[domain_dropdown],
        outputs=[article_input]
    )
    analyze_btn.click(
        fn=newsscope_api,
        inputs=[domain_dropdown, article_input],
        outputs=[output_box]
    )

# Queue requests (model inference is slow) and start the server.
demo.queue(max_size=10).launch()