Spaces:

itshiroto
/

hoax-detection-indo

Runtime error

Rivo Juicer Wowor

update

9c7d5c0 unverified about 1 year ago

11.9 kB

	import json
	import os
	from typing import Dict, List, Optional

	import pandas as pd
	from dotenv import load_dotenv

	# LangChain Imports
	from langchain.embeddings import OpenAIEmbeddings
	from langchain.vectorstores import Chroma
	from langchain.schema import Document
	from langchain_openai import ChatOpenAI
	from langchain.prompts import ChatPromptTemplate
	from langchain.tools import tool
	from langchain.agents import AgentExecutor
	# SecretStr is a pydantic type; `typing` does not export it, so importing it
	# from `typing` raised ImportError as soon as this module was loaded.
	from langchain.pydantic_v1 import BaseModel, Field, SecretStr

	# LangGraph
	from langgraph.graph import StateGraph, END

	# SerpApi for real-time web search
	from langchain.utilities import SerpAPIWrapper

	# Load environment variables (values from a local .env file, if one exists)
	load_dotenv()

	# Configuration - read from the environment only.
	# SECURITY: an API key used to be hard-coded here as the fallback value.
	# Secrets must never be committed; the key that was exposed should be treated
	# as compromised and revoked.
	OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
	if not OPENAI_API_KEY:
	    # Fail at import with a clear message instead of at the first LLM call.
	    raise RuntimeError(
	        "OPENAI_API_KEY is not set. Export it or add it to a .env file "
	        "before importing this module."
	    )
	OPENAI_API_BASE = os.environ.get(
	    "OPENAI_API_BASE", "https://openrouter.ai/api/v1"
	)  # URL for OpenAI-compatible server

	# Initialize models and tools
	llm = ChatOpenAI(
	    model="meta-llama/llama-3.3-70b-instruct",
	    # Passed as a plain string: the client converts it to its own secret type,
	    # so this line does not depend on which SecretStr class is importable.
	    api_key=OPENAI_API_KEY,
	    base_url=OPENAI_API_BASE,
	    temperature=0.1,
	)
	# NOTE(review): both objects are built without arguments, so whatever
	# credentials they need are not supplied from this file - presumably they are
	# read from OPENAI_API_KEY / SERPAPI_API_KEY in the environment; confirm.
	embeddings = OpenAIEmbeddings()
	search = SerpAPIWrapper()


	# Define the State schema
	class AgentState(BaseModel):
	    """Data passed from node to node in the hoax-detection graph.

	    ``input_article`` is the only required field; every other field defaults
	    to ``None`` until a node fills it in.
	    """

	    # The text under verification (required).
	    input_article: str = Field(description="The news article to be verified")

	    # Filled by retrieve_from_databases.
	    hoax_db_results: Optional[List[Dict]] = Field(
	        None, description="Results from the hoax database"
	    )
	    web_search_results: Optional[List[Dict]] = Field(
	        None, description="Results from web search"
	    )

	    # Filled by analyze_and_assess.
	    similarity_scores: Optional[Dict] = Field(
	        None,
	        description="Similarity metrics between input and retrieved articles",
	    )
	    initial_assessment: Optional[Dict] = Field(
	        None, description="Initial automated assessment"
	    )

	    # Filled by make_final_judgment.
	    final_verdict: Optional[Dict] = Field(
	        None, description="Final verdict from the LLM Judge"
	    )


	# Setup ChromaDB for hoax news data
	def setup_hoax_database(csv_path: str) -> Chroma:
	    """Build the ``hoax_news`` Chroma collection from a CSV file.

	    Every CSV row becomes one document: the ``content`` column supplies the
	    document text and all remaining columns are attached as metadata.

	    Args:
	        csv_path: Path of the CSV file to load with pandas.

	    Returns:
	        The vector store created by ``Chroma.from_documents`` using the
	        module-level ``embeddings`` object.
	    """
	    frame = pd.read_csv(csv_path)
	    metadata_columns = [name for name in frame.columns if name != "content"]

	    def to_document(record) -> Document:
	        # Metadata is collected before the text is read, as in the row loop
	        # this replaces.
	        extra = {name: record[name] for name in metadata_columns}
	        return Document(page_content=record["content"], metadata=extra)

	    return Chroma.from_documents(
	        documents=[to_document(record) for _, record in frame.iterrows()],
	        embedding=embeddings,
	        collection_name="hoax_news",
	    )


	# Tool for querying the hoax database
	@tool("query_hoax_database")
	def query_hoax_database(query: str, db: Chroma, k: int = 3) -> List[Dict]:
	    """Query the hoax news database for similar articles."""
	    # The docstring above is kept verbatim: the @tool decorator uses it as the
	    # tool description.
	    #
	    # query: text to look up; db: vector store to search; k: number of hits.
	    # Returns one dict per hit with keys "content", "metadata" and
	    # "similarity_score".
	    #
	    # `similarity_search_with_score` reports a *distance* (lower = closer), but
	    # downstream code treats "similarity_score" as higher = more similar (it
	    # flags an article when the average exceeds 0.75). The relevance-score API
	    # returns a normalised score where higher means more similar, which is
	    # what the key name and that threshold assume.
	    scored_docs = db.similarity_search_with_relevance_scores(query, k=k)

	    return [
	        {
	            "content": doc.page_content,
	            "metadata": doc.metadata,
	            # Plain float so the value can later be passed to json.dumps.
	            "similarity_score": float(score),
	        }
	        for doc, score in scored_docs
	    ]


	# Tool for web search using SerpAPI
	@tool("search_web")
	def search_web(query: str) -> List[Dict]:
	    """Search the web for real news articles related to the query."""
	    # Returns at most three dicts with keys "title", "link" and "snippet";
	    # a key missing from a hit is filled with "". The list is empty when the
	    # response has no "organic_results" entry.
	    response = search.results(query)
	    if "organic_results" not in response:
	        return []

	    top_hits = response["organic_results"][:3]  # Limit to top 3 results
	    wanted_fields = ("title", "link", "snippet")
	    return [
	        {field: hit.get(field, "") for field in wanted_fields}
	        for hit in top_hits
	    ]


	# Tool for calculating similarity and initial assessment
	@tool("analyze_similarity")
	def analyze_similarity(
	    input_article: str, hoax_results: List[Dict], web_results: List[Dict]
	) -> Dict:
	    """Analyze similarity between input article and retrieved articles."""
	    # `input_article` is part of the signature but is not read in this body.
	    # Returns {"similarity_scores": {...}, "initial_indication": str}.

	    # Mean score over the hoax hits; 0 when there are none.
	    if hoax_results:
	        score_total = sum(hit["similarity_score"] for hit in hoax_results)
	        mean_score = score_total / len(hoax_results)
	    else:
	        mean_score = 0

	    matches = [
	        {"content": hit["content"], "score": hit["similarity_score"]}
	        for hit in hoax_results
	    ]
	    metrics = {
	        "average_hoax_similarity": mean_score,
	        "hoax_matches": matches,
	        # Only the number of web hits is recorded, not their content.
	        "web_relevance": len(web_results),
	    }

	    # Anything above the 0.75 cut-off is flagged; everything else passes.
	    indication = "Likely Legitimate"
	    if mean_score > 0.75:
	        indication = "Suspicious"

	    return {"similarity_scores": metrics, "initial_indication": indication}


	# Tool for LLM Judge to provide final verdict
	def _parse_verdict(content: str) -> str:
	    """Extract ``"HOAX"`` or ``"REAL"`` from the judge's free-text answer."""
	    import re

	    # Remove an echoed "(HOAX or REAL)" option list so it is not mistaken for
	    # the answer itself.
	    cleaned = re.sub(
	        r"\(?\bHOAX\s+or\s+REAL\b\)?", " ", content, flags=re.IGNORECASE
	    )

	    # 1) A line that labels the verdict, e.g. "Final verdict: **REAL**".
	    #    Word boundaries keep "hoaxes" or "really" from matching.
	    labelled = re.search(
	        r"verdict\b[^\n]*?\b(HOAX|REAL)\b", cleaned, flags=re.IGNORECASE
	    )
	    if labelled:
	        return labelled.group(1).upper()

	    # 2) Otherwise the first stand-alone upper-case keyword.
	    keyword = re.search(r"\b(HOAX|REAL)\b", cleaned)
	    if keyword:
	        return keyword.group(1)

	    # 3) Last resort: the previous substring heuristic, defaulting to "REAL".
	    return "HOAX" if "HOAX" in content.upper() else "REAL"


	def _parse_confidence(content: str) -> int:
	    """Extract the confidence percentage (0-100) from the judge's answer."""
	    import re

	    saw_confidence_line = False
	    for line in content.splitlines():
	        if "confidence" not in line.lower() or "%" not in line:
	            continue
	        saw_confidence_line = True
	        # Ignore an echoed "(0-100%)" range so it is not read as the score.
	        candidate = re.sub(
	            r"\(?(?<!\d)0\s*[-\u2013\u2014]\s*100\s*%?\)?", " ", line
	        )
	        match = re.search(r"(\d+(?:\.\d+)?)\s*%", candidate)
	        if match:
	            # Whole number is parsed (so "85.5%" is not read as 55), then
	            # rounded and clamped to the 0-100 range the prompt asks for.
	            return max(0, min(100, round(float(match.group(1)))))

	    # Same fallbacks as before: 80 when a confidence line exists but holds no
	    # readable number, 0 when confidence is never mentioned.
	    return 80 if saw_confidence_line else 0


	# Tool for LLM Judge to provide final verdict
	@tool("llm_judge")
	def llm_judge(
	    input_article: str,
	    hoax_results: List[Dict],
	    web_results: List[Dict],
	    similarity_analysis: Dict,
	) -> Dict:
	    """LLM-as-a-Judge to provide final verdict on the article's authenticity."""
	    # The docstring above is kept verbatim: the @tool decorator uses it as the
	    # tool description.
	    #
	    # Returns {"verdict": "HOAX" | "REAL", "confidence": int,
	    #          "justification": <full model answer>}.

	    # Create a prompt for the LLM Judge
	    judge_prompt = ChatPromptTemplate.from_messages(
	        [
	            (
	                "system",
	                """You are an expert fact-checker and misinformation analyst.
	Your task is to determine if the provided news article is a hoax or real news.
	Analyze all the evidence provided: similarity to known hoaxes, web search results,
	and contextual indicators. Provide a verdict, confidence score, and detailed justification.""",
	            ),
	            (
	                "user",
	                """
	# News Article to Verify
	{input_article}

	# Similarity Analysis
	{similarity_analysis}

	# Known Hoax Matches
	{hoax_matches}

	# Web Search Results
	{web_results}

	Determine if this article is a hoax or real news. Provide:
	1. Final verdict (HOAX or REAL)
	2. Confidence score (0-100%)
	3. Detailed justification for your decision
	""",
	            ),
	        ]
	    )

	    # Serialise the evidence; only content and score of each hoax hit are sent.
	    hoax_matches_text = json.dumps(
	        [
	            {"content": r["content"], "similarity": r["similarity_score"]}
	            for r in hoax_results
	        ],
	        indent=2,
	    )
	    web_results_text = json.dumps(web_results, indent=2)
	    similarity_analysis_text = json.dumps(similarity_analysis, indent=2)

	    # Get response from LLM (prompt piped into the module-level `llm`)
	    chain = judge_prompt | llm
	    response = chain.invoke(
	        {
	            "input_article": input_article,
	            "similarity_analysis": similarity_analysis_text,
	            "hoax_matches": hoax_matches_text,
	            "web_results": web_results_text,
	        }
	    )

	    # Make sure the text helpers below always receive a string.
	    raw_content = response.content
	    content = raw_content if isinstance(raw_content, str) else str(raw_content)

	    return {
	        "verdict": _parse_verdict(content),
	        "confidence": _parse_confidence(content),
	        "justification": content,
	    }


	# Node functions for the graph
	# Vector stores already built in this process, keyed by CSV path.
	_HOAX_DB_CACHE: Dict[str, Chroma] = {}


	def _get_hoax_database(csv_path: str) -> Chroma:
	    """Return the vector store for *csv_path*, building it on first use only."""
	    if csv_path not in _HOAX_DB_CACHE:
	        _HOAX_DB_CACHE[csv_path] = setup_hoax_database(csv_path)
	    return _HOAX_DB_CACHE[csv_path]


	def retrieve_from_databases(state: AgentState) -> AgentState:
	    """Retrieve information from hoax database and web search.

	    Args:
	        state: Current graph state; only ``input_article`` is read.

	    Returns:
	        A new ``AgentState`` with ``hoax_db_results`` and
	        ``web_search_results`` filled in and all other fields copied over.
	    """
	    # Built once per process: calling setup_hoax_database on every run would
	    # re-embed the whole CSV and add it again to the same named collection.
	    hoax_db = _get_hoax_database("hoax_news_data.csv")  # Assuming this file exists

	    # `query_hoax_database` and `search_web` are tool objects produced by
	    # @tool, not plain functions, so they are run through `.invoke` with a
	    # dict of named arguments rather than called positionally.
	    hoax_results = query_hoax_database.invoke(
	        {"query": state.input_article, "db": hoax_db}
	    )
	    web_results = search_web.invoke({"query": state.input_article})

	    return AgentState(
	        **{
	            **state.dict(),
	            "hoax_db_results": hoax_results,
	            "web_search_results": web_results,
	        }
	    )


	def analyze_and_assess(state: AgentState) -> AgentState:
	    """Analyze similarity and provide initial assessment.

	    Args:
	        state: Current graph state; reads ``input_article``,
	            ``hoax_db_results`` and ``web_search_results``.

	    Returns:
	        A new ``AgentState`` with ``similarity_scores`` and
	        ``initial_assessment`` filled in and all other fields copied over.
	    """
	    # `analyze_similarity` is a tool object produced by @tool, so it is run
	    # through `.invoke` with named arguments; a positional call with three
	    # arguments is not supported by tool objects.
	    analysis = analyze_similarity.invoke(
	        {
	            "input_article": state.input_article,
	            # Both fields are Optional on the state; treat None as "no hits".
	            "hoax_results": state.hoax_db_results or [],
	            "web_results": state.web_search_results or [],
	        }
	    )

	    return AgentState(
	        **{
	            **state.dict(),
	            "similarity_scores": analysis["similarity_scores"],
	            "initial_assessment": {"indication": analysis["initial_indication"]},
	        }
	    )


	def make_final_judgment(state: AgentState) -> AgentState:
	    """Make final judgment using LLM-as-a-Judge.

	    Args:
	        state: Current graph state; reads the article, both result lists,
	            ``similarity_scores`` and ``initial_assessment``.

	    Returns:
	        A new ``AgentState`` with ``final_verdict`` filled in and all other
	        fields copied over.
	    """
	    # `llm_judge` is a tool object produced by @tool, so it is run through
	    # `.invoke` with named arguments instead of a positional call.
	    verdict = llm_judge.invoke(
	        {
	            "input_article": state.input_article,
	            # Both fields are Optional on the state; treat None as "no hits".
	            "hoax_results": state.hoax_db_results or [],
	            "web_results": state.web_search_results or [],
	            "similarity_analysis": {
	                "similarity_scores": state.similarity_scores,
	                "initial_assessment": state.initial_assessment,
	            },
	        }
	    )

	    # Unpack the merged dict into keyword arguments. The previous form,
	    # `AgentState({state.dict(), ...})`, built a set literal containing a
	    # dict, which raises TypeError.
	    return AgentState(**{**state.dict(), "final_verdict": verdict})


	# Create the Hoax Detection workflow using LangGraph
	def create_hoax_detection_workflow() -> AgentExecutor:
	    """Assemble and compile the three-step hoax-detection graph.

	    The graph runs ``retrieve`` -> ``analyze`` -> ``judge`` and then ends,
	    with ``retrieve`` as the entry point.

	    Returns:
	        The object produced by ``StateGraph.compile()``.

	    NOTE(review): the return annotation says ``AgentExecutor``, but the value
	    returned is whatever ``StateGraph.compile()`` yields - confirm and correct
	    the annotation.
	    """
	    graph = StateGraph(AgentState)

	    # Register the processing steps in execution order.
	    steps = (
	        ("retrieve", retrieve_from_databases),
	        ("analyze", analyze_and_assess),
	        ("judge", make_final_judgment),
	    )
	    for step_name, step_fn in steps:
	        graph.add_node(step_name, step_fn)

	    # Chain the steps linearly and terminate after the judge.
	    transitions = (("retrieve", "analyze"), ("analyze", "judge"), ("judge", END))
	    for source, target in transitions:
	        graph.add_edge(source, target)

	    graph.set_entry_point("retrieve")

	    return graph.compile()


	# Main function to run the hoax detection
	def detect_hoax(article_text: str) -> Dict:
	    """Run the full hoax detection process on an article.

	    Args:
	        article_text: Text of the news article to verify.

	    Returns:
	        A dict with keys ``"classification"``, ``"confidence"`` and
	        ``"justification"`` taken from the workflow's final verdict.

	    Raises:
	        RuntimeError: If the workflow finishes without a final verdict.
	    """
	    # Create the workflow
	    hoax_detector = create_hoax_detection_workflow()

	    # Initialize state with input article
	    initial_state = AgentState(input_article=article_text)

	    # Run the workflow
	    final_state = hoax_detector.invoke(initial_state)

	    # A compiled graph hands back the state values as a mapping, not as an
	    # AgentState instance, so plain attribute access would fail. Support
	    # both shapes.
	    if isinstance(final_state, dict):
	        verdict = final_state.get("final_verdict")
	    else:
	        verdict = getattr(final_state, "final_verdict", None)

	    if not verdict:
	        raise RuntimeError(
	            "Hoax detection workflow finished without a final verdict"
	        )

	    # Return the final verdict
	    return {
	        "classification": verdict["verdict"],
	        "confidence": verdict["confidence"],
	        "justification": verdict["justification"],
	    }


	# Example usage
	if __name__ == "__main__":
	    # Demo run: classify one obviously fabricated article and print the report.
	    sample_article = """
	BREAKING: Scientists Discover Microchips in COVID-19 Vaccines

	Researchers at a leading independent laboratory have found microscopic tracking devices
	in samples of COVID-19 vaccines. These microchips, barely visible under an electron
	microscope, are allegedly capable of tracking individuals and transmitting data to
	satellite systems. Government officials have declined to comment on these findings,
	raising further suspicions about the true purpose of the global vaccination campaign.

	The discovery was made by Dr. James Wilson, who claims he was subsequently threatened
	with losing his research funding if he publicized the results. Several other scientists
	have allegedly corroborated these findings but remain anonymous out of fear for their careers.
	"""

	    outcome = detect_hoax(sample_article)

	    # Each entry is printed on its own, in the same order as before.
	    report = [
	        "\n===== HOAX DETECTION RESULTS =====",
	        f"Classification: {outcome['classification']}",
	        f"Confidence: {outcome['confidence']}%",
	        "\nJustification:",
	        outcome["justification"],
	    ]
	    for entry in report:
	        print(entry)