Spaces:
Runtime error
Runtime error
import functools
import json
import os
import re
from typing import Dict, List, Optional

import pandas as pd
from dotenv import load_dotenv

# LangChain Imports
from langchain.agents import AgentExecutor
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate

# SecretStr is not part of ``typing`` (importing it from there raises
# ImportError); it is a pydantic type, re-exported by LangChain's shim.
from langchain.pydantic_v1 import BaseModel, Field, SecretStr
from langchain.schema import Document
from langchain.tools import tool
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI

# SerpApi for real-time web search
from langchain.utilities import SerpAPIWrapper

# LangGraph
from langgraph.graph import StateGraph, END
# Load environment variables (a local .env file is honoured if present)
load_dotenv()

# Configuration.
# SECURITY: credentials must come from the environment / .env file only.
# A real-looking key was previously committed here as a fallback default;
# any key that has ever been committed should be treated as leaked and rotated.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it or add it to a .env file."
    )

# URL for the OpenAI-compatible server (defaults to OpenRouter).
OPENAI_API_BASE = os.environ.get("OPENAI_API_BASE", "https://openrouter.ai/api/v1")

# Initialize models and tools.
# The key is passed as a plain string; NOTE(review): ChatOpenAI is expected to
# coerce it into its own secret type - confirm against the installed version.
llm = ChatOpenAI(
    model="meta-llama/llama-3.3-70b-instruct",
    api_key=OPENAI_API_KEY,
    base_url=OPENAI_API_BASE,
    temperature=0.1,
)
# NOTE(review): these two are constructed without arguments, so they presumably
# pick up their credentials (e.g. SERPAPI_API_KEY) from the environment - verify.
embeddings = OpenAIEmbeddings()
search = SerpAPIWrapper()
# Define the State schema
class AgentState(BaseModel):
    """Shared state carried through the hoax-detection graph.

    Only ``input_article`` is required. Every other field starts as ``None``
    and is filled in by a node function as the workflow progresses.
    """

    # Text of the article being checked.
    input_article: str = Field(description="The news article to be verified")

    # Evidence gathered by the retrieval step.
    hoax_db_results: Optional[List[Dict]] = Field(
        None, description="Results from the hoax database"
    )
    web_search_results: Optional[List[Dict]] = Field(
        None, description="Results from web search"
    )

    # Output of the similarity analysis step.
    similarity_scores: Optional[Dict] = Field(
        None,
        description="Similarity metrics between input and retrieved articles",
    )
    initial_assessment: Optional[Dict] = Field(
        None, description="Initial automated assessment"
    )

    # Output of the LLM judge step.
    final_verdict: Optional[Dict] = Field(
        None, description="Final verdict from the LLM Judge"
    )
# Setup ChromaDB for hoax news data
def setup_hoax_database(csv_path: str) -> Chroma:
    """Build a Chroma collection named ``hoax_news`` from a CSV file.

    Each CSV row becomes one ``Document``: the ``content`` column is the page
    text and every other column is copied into the document metadata.

    Args:
        csv_path: Path of the CSV file to load with ``pandas.read_csv``.

    Returns:
        The vector store created by ``Chroma.from_documents`` using the
        module-level ``embeddings`` object.
    """
    frame = pd.read_csv(csv_path)
    metadata_columns = [name for name in frame.columns if name != "content"]

    docs = [
        Document(
            page_content=record["content"],
            metadata={name: record[name] for name in metadata_columns},
        )
        for _, record in frame.iterrows()
    ]

    return Chroma.from_documents(
        documents=docs, embedding=embeddings, collection_name="hoax_news"
    )
# Tool for querying the hoax database
def query_hoax_database(query: str, db: Chroma, k: int = 3) -> List[Dict]:
    """Query the hoax news database for the articles most similar to *query*.

    The store is queried with ``similarity_search_with_relevance_scores`` so
    that a HIGHER score means a CLOSER match. ``similarity_search_with_score``
    returns a distance for Chroma (lower means closer), which inverted the
    meaning of the ``similarity_score`` key and of the "greater than
    threshold means suspicious" check applied to it by ``analyze_similarity``.

    Args:
        query: Text to look up (the article under verification).
        db: Vector store holding the known hoax articles.
        k: Maximum number of matches to return.

    Returns:
        One dict per match with the keys ``content``, ``metadata`` and
        ``similarity_score``.
    """
    matches = db.similarity_search_with_relevance_scores(query, k=k)
    return [
        {
            "content": doc.page_content,
            "metadata": doc.metadata,
            # Plain float keeps the value JSON-serialisable (llm_judge dumps
            # it with json.dumps) in case the backend returns a NumPy scalar.
            "similarity_score": float(score),
        }
        for doc, score in matches
    ]
# Tool for web search using SerpAPI
def search_web(query: str) -> List[Dict]:
    """Run *query* through the module-level SerpAPI wrapper.

    Args:
        query: Search text sent to ``search.results``.

    Returns:
        At most three dicts (``title``, ``link``, ``snippet``) taken from the
        ``organic_results`` entry of the response; missing fields become empty
        strings. An empty list is returned when the response has no
        ``organic_results`` key.
    """
    raw = search.results(query)
    if "organic_results" not in raw:
        return []

    top_hits = raw["organic_results"][:3]  # Limit to top 3 results
    return [
        {
            "title": hit.get("title", ""),
            "link": hit.get("link", ""),
            "snippet": hit.get("snippet", ""),
        }
        for hit in top_hits
    ]
# Tool for calculating similarity and initial assessment
def analyze_similarity(
    input_article: str,
    hoax_results: Optional[List[Dict]],
    web_results: Optional[List[Dict]],
    suspicious_threshold: float = 0.75,
) -> Dict:
    """Summarise how closely the article matches known hoaxes.

    Args:
        input_article: The article text. Currently unused; kept so the
            signature stays stable for existing callers.
        hoax_results: Hoax-database matches, each with ``content`` and
            ``similarity_score`` keys. ``None`` is treated as "no matches".
        web_results: Web search hits. ``None`` is treated as "no hits".
        suspicious_threshold: The article is flagged "Suspicious" when the
            average hoax similarity is strictly greater than this value.

    Returns:
        A dict with two keys:

        * ``similarity_scores``: ``average_hoax_similarity`` (0 when there are
          no matches), ``hoax_matches`` (list of ``content``/``score`` dicts)
          and ``web_relevance`` (number of web hits).
        * ``initial_indication``: ``"Suspicious"`` or ``"Likely Legitimate"``.
    """
    # The state fields feeding this function default to None, so normalise
    # here instead of crashing on iteration / len().
    hoax_results = hoax_results or []
    web_results = web_results or []

    scores = [match["similarity_score"] for match in hoax_results]
    average_similarity = sum(scores) / len(scores) if scores else 0

    similarity_scores = {
        "average_hoax_similarity": average_similarity,
        "hoax_matches": [
            {"content": match["content"], "score": match["similarity_score"]}
            for match in hoax_results
        ],
        "web_relevance": len(web_results),
    }

    is_suspicious = average_similarity > suspicious_threshold
    return {
        "similarity_scores": similarity_scores,
        "initial_indication": "Suspicious" if is_suspicious else "Likely Legitimate",
    }
# Tool for LLM Judge to provide final verdict
_VERDICT_LABEL_RE = re.compile(r"\b(HOAX|REAL)\b", re.IGNORECASE)
_VERDICT_OPTIONS_RE = re.compile(r"HOAX\s+or\s+REAL", re.IGNORECASE)
_CONFIDENCE_RANGE_RE = re.compile(r"0\s*-\s*100\s*%")
_PERCENT_RE = re.compile(r"(?<![\d.])(\d{1,3})(?:\.\d+)?\s*%")


def _extract_verdict(content: str) -> str:
    """Return "HOAX" or "REAL" parsed from the judge's free-text answer."""
    for line in content.splitlines():
        if "verdict" not in line.lower():
            continue
        # Drop an echoed "(HOAX or REAL)" option list so it is not mistaken
        # for the answer itself.
        cleaned = _VERDICT_OPTIONS_RE.sub("", line)
        labels = {label.upper() for label in _VERDICT_LABEL_RE.findall(cleaned)}
        if len(labels) == 1:
            return labels.pop()
    # No unambiguous verdict line: fall back to the original heuristic.
    return "HOAX" if "HOAX" in content.upper() else "REAL"


def _extract_confidence(content: str) -> int:
    """Return the confidence percentage (0-100) parsed from the answer."""
    confidence = 0  # Used when no line mentions a confidence percentage.
    for line in content.splitlines():
        if "confidence" not in line.lower():
            continue
        # Ignore the "0-100%" range when the model echoes the instructions.
        cleaned = _CONFIDENCE_RANGE_RE.sub("", line)
        if "%" not in cleaned:
            continue
        percentages = _PERCENT_RE.findall(cleaned)
        # As before, a later matching line overrides an earlier one, and 80
        # is the default when the line cannot be parsed.
        confidence = min(int(percentages[-1]), 100) if percentages else 80
    return confidence


def llm_judge(
    input_article: str,
    hoax_results: List[Dict],
    web_results: List[Dict],
    similarity_analysis: Dict,
) -> Dict:
    """Ask the module-level LLM for a final verdict on the article.

    The evidence is serialised to JSON, inserted into a fixed system/user
    prompt and sent to ``llm``; the free-text reply is then parsed.

    Args:
        input_article: The article text under verification.
        hoax_results: Hoax-database matches with ``content`` and
            ``similarity_score`` keys. ``None`` is treated as no matches.
        web_results: Web search hits (must be JSON-serialisable).
        similarity_analysis: Similarity metrics and initial assessment
            (must be JSON-serialisable).

    Returns:
        A dict with ``verdict`` ("HOAX" or "REAL"), ``confidence`` (int
        percentage) and ``justification`` (the full LLM reply).
    """
    # Create a prompt for the LLM Judge
    judge_prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """You are an expert fact-checker and misinformation analyst.
Your task is to determine if the provided news article is a hoax or real news.
Analyze all the evidence provided: similarity to known hoaxes, web search results,
and contextual indicators. Provide a verdict, confidence score, and detailed justification.""",
            ),
            (
                "user",
                """
# News Article to Verify
{input_article}
# Similarity Analysis
{similarity_analysis}
# Known Hoax Matches
{hoax_matches}
# Web Search Results
{web_results}
Determine if this article is a hoax or real news. Provide:
1. Final verdict (HOAX or REAL)
2. Confidence score (0-100%)
3. Detailed justification for your decision
""",
            ),
        ]
    )

    # Extract relevant data for the prompt
    hoax_matches_text = json.dumps(
        [
            {"content": match["content"], "similarity": match["similarity_score"]}
            for match in hoax_results or []
        ],
        indent=2,
    )
    web_results_text = json.dumps(web_results, indent=2)
    similarity_analysis_text = json.dumps(similarity_analysis, indent=2)

    # Get response from LLM
    chain = judge_prompt | llm
    response = chain.invoke(
        {
            "input_article": input_article,
            "similarity_analysis": similarity_analysis_text,
            "hoax_matches": hoax_matches_text,
            "web_results": web_results_text,
        }
    )

    # The parsers below need text; stringify anything else defensively.
    content = response.content
    if not isinstance(content, str):
        content = str(content)

    return {
        "verdict": _extract_verdict(content),
        "confidence": _extract_confidence(content),
        "justification": content,
    }
# Node functions for the graph
_HOAX_CSV_PATH = "hoax_news_data.csv"  # Assuming this file exists


@functools.lru_cache(maxsize=4)
def _load_hoax_database(csv_path: str) -> "Chroma":
    """Build the hoax vector store for *csv_path* once and reuse it afterwards."""
    return setup_hoax_database(csv_path)


def retrieve_from_databases(state: AgentState) -> AgentState:
    """Collect evidence for the article from the hoax database and the web.

    The vector store is cached per CSV path. Rebuilding it on every call
    re-read the CSV and re-embedded every row each time an article was
    checked, which is slow and, with a paid embedding API, costly.

    Args:
        state: Current workflow state; only ``input_article`` is read.

    Returns:
        A new ``AgentState`` with ``hoax_db_results`` and
        ``web_search_results`` filled in.
    """
    hoax_db = _load_hoax_database(_HOAX_CSV_PATH)

    # Query hoax database
    hoax_results = query_hoax_database(state.input_article, hoax_db)
    # Search web
    web_results = search_web(state.input_article)

    return AgentState(
        **{
            **state.dict(),
            "hoax_db_results": hoax_results,
            "web_search_results": web_results,
        }
    )
def analyze_and_assess(state: AgentState) -> AgentState:
    """Graph node: score the retrieved evidence and record a first indication.

    Args:
        state: Current workflow state; the article text and both result lists
            are handed to ``analyze_similarity``.

    Returns:
        A new ``AgentState`` whose ``similarity_scores`` holds the computed
        metrics and whose ``initial_assessment`` is ``{"indication": ...}``.
    """
    report = analyze_similarity(
        state.input_article, state.hoax_db_results, state.web_search_results
    )

    fields = state.dict()
    fields.update(
        similarity_scores=report["similarity_scores"],
        initial_assessment={"indication": report["initial_indication"]},
    )
    return AgentState(**fields)
def make_final_judgment(state: AgentState) -> AgentState:
    """Graph node: obtain the final verdict from ``llm_judge``.

    Args:
        state: Current workflow state carrying the article, the retrieved
            evidence, the similarity metrics and the initial assessment.

    Returns:
        A new ``AgentState`` with ``final_verdict`` set to the judge's result.
    """
    evidence_summary = {
        "similarity_scores": state.similarity_scores,
        "initial_assessment": state.initial_assessment,
    }
    judgment = llm_judge(
        state.input_article,
        state.hoax_db_results,
        state.web_search_results,
        evidence_summary,
    )

    fields = state.dict()
    fields["final_verdict"] = judgment
    return AgentState(**fields)
# Create the Hoax Detection workflow using LangGraph
def create_hoax_detection_workflow() -> AgentExecutor:
    """Assemble and compile the linear retrieve -> analyze -> judge graph.

    Returns:
        The object produced by ``StateGraph.compile()``.
        NOTE(review): the return annotation says ``AgentExecutor``, but the
        value returned is the compiled graph - confirm and correct the hint.
    """
    graph = StateGraph(AgentState)

    # Steps in execution order; each one feeds the next.
    pipeline = [
        ("retrieve", retrieve_from_databases),
        ("analyze", analyze_and_assess),
        ("judge", make_final_judgment),
    ]
    for step_name, step_fn in pipeline:
        graph.add_node(step_name, step_fn)

    # Chain consecutive steps, with the last step leading to END.
    step_names = [step_name for step_name, _ in pipeline]
    for source, target in zip(step_names, step_names[1:] + [END]):
        graph.add_edge(source, target)

    graph.set_entry_point(step_names[0])
    return graph.compile()
# Main function to run the hoax detection
def detect_hoax(article_text: str) -> Dict:
    """Run the full hoax detection workflow on one article.

    Args:
        article_text: The news article to verify.

    Returns:
        A dict with ``classification``, ``confidence`` and ``justification``
        taken from the workflow's final verdict.

    Raises:
        RuntimeError: If the workflow finishes without a final verdict.
    """
    hoax_detector = create_hoax_detection_workflow()
    initial_state = AgentState(input_article=article_text)
    final_state = hoax_detector.invoke(initial_state)

    # A compiled graph typically returns a plain mapping of state values
    # rather than an AgentState instance, so attribute access alone would
    # fail. Support both shapes.
    if isinstance(final_state, dict):
        final_verdict = final_state.get("final_verdict")
    else:
        final_verdict = getattr(final_state, "final_verdict", None)

    if not final_verdict:
        raise RuntimeError(
            "Hoax detection workflow finished without a final verdict"
        )

    return {
        "classification": final_verdict["verdict"],
        "confidence": final_verdict["confidence"],
        "justification": final_verdict["justification"],
    }
# Example usage
if __name__ == "__main__":
    # Demo input: a fabricated article used to exercise the pipeline end to end.
    test_article = """
BREAKING: Scientists Discover Microchips in COVID-19 Vaccines
Researchers at a leading independent laboratory have found microscopic tracking devices
in samples of COVID-19 vaccines. These microchips, barely visible under an electron
microscope, are allegedly capable of tracking individuals and transmitting data to
satellite systems. Government officials have declined to comment on these findings,
raising further suspicions about the true purpose of the global vaccination campaign.
The discovery was made by Dr. James Wilson, who claims he was subsequently threatened
with losing his research funding if he publicized the results. Several other scientists
have allegedly corroborated these findings but remain anonymous out of fear for their careers.
"""

    result = detect_hoax(test_article)

    # Assemble the report first, then emit it in a single write.
    report_lines = [
        "\n===== HOAX DETECTION RESULTS =====",
        f"Classification: {result['classification']}",
        f"Confidence: {result['confidence']}%",
        "\nJustification:",
        f"{result['justification']}",
    ]
    print("\n".join(report_lines))