File size: 4,339 Bytes
d074dd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282bbff
 
d074dd5
282bbff
d074dd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b9c581
d074dd5
 
918517f
 
d074dd5
 
 
 
 
 
 
 
 
315679d
 
d074dd5
 
 
5b9c581
d074dd5
9e794eb
d074dd5
5b9c581
d074dd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da3fe65
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import requests
import time
import csv
import pandas as pd
import kagglehub
import gradio as gr
from cerebras.cloud.sdk import Cerebras
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import numpy as np

# Initialize Cerebras API client.
# The API key is read from the "Kc" environment variable; if it is unset,
# Cerebras() receives None — presumably failing at first request time,
# TODO confirm it fails loudly rather than silently.
Cerekey = os.getenv("Kc")
client = Cerebras(api_key= Cerekey)

# NewsAPI key, read from the "Nk" environment variable (used by get_latest_news).
Newskey = os.getenv("Nk")

def get_latest_news(query):
    """Fetch up to two recent news articles matching *query* from NewsAPI.

    Args:
        query: Free-text search phrase for NewsAPI's /v2/everything endpoint.

    Returns:
        A list of at most two (title, url, source_name) tuples; empty when
        NewsAPI returns no articles or an error payload (error responses
        carry no "articles" key, so .get() keeps this from raising).
    """
    # BUG FIX: the query was interpolated raw into the URL, so spaces and
    # special characters produced malformed requests; `params=` URL-encodes
    # it properly. A timeout is added so a stalled connection cannot hang
    # the whole fact-check pipeline indefinitely.
    response = requests.get(
        "https://newsapi.org/v2/everything",
        params={"q": query, "apiKey": Newskey},
        timeout=10,
    )
    data = response.json()
    return [
        (article["title"], article["url"], article["source"]["name"])
        for article in data.get("articles", [])[:2]
    ]

def update_fact_checks_file(query):
    """Overwrite fact_checks.txt so it contains only *query* plus a newline."""
    with open("fact_checks.txt", mode="w", encoding="utf-8") as fh:
        fh.write(f"{query}\n")

def create_faiss_retriever():
    """Build a FAISS similarity retriever over the fact_checks.txt corpus.

    Ensures fact_checks.txt exists (creating an empty file if needed),
    splits its contents into overlapping 200-char chunks, embeds them with
    a MiniLM sentence-transformer, and returns a top-4 similarity retriever.
    """
    # Guarantee the backing file exists so TextLoader does not fail.
    if not os.path.exists("fact_checks.txt"):
        open("fact_checks.txt", "w").close()

    documents = TextLoader("fact_checks.txt").load()

    splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    chunks = splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    store = FAISS.from_documents(chunks, embeddings)

    return store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

def clear_fact_checks_file():
    """Truncate fact_checks.txt to empty (creating it if absent)."""
    with open("fact_checks.txt", "w"):
        pass

def fact_check_with_llama3(query):
    """Fact-check *query* with Llama 3.3 via Cerebras, using RAG context.

    Writes the claim to fact_checks.txt, retrieves similar chunks from a
    FAISS index built over that file, streams a verdict from the model,
    and pairs it with recent news headlines for the same query.

    NOTE(review): the FAISS index is built over a file containing only the
    claim itself, so the retrieved "context" is effectively the claim —
    confirm whether news text was meant to be indexed here as well.

    Args:
        query: The claim to verify.

    Returns:
        Tuple of (verdict_text, sources_text) where sources_text lists up
        to two news articles or "No relevant sources found.".
    """
    update_fact_checks_file(query)
    retriever = create_faiss_retriever()
    retrieved_docs = retriever.invoke(query)
    retrieved_texts = [doc.page_content for doc in retrieved_docs]
    news = get_latest_news(query)
    context_text = "\n".join(retrieved_texts)

    prompt = f"""
    Claim: {query}
    Context: {context_text}
    Based on the provided context, determine whether the claim is True, False, or Misleading. Provide a concise explanation and cite relevant sources. Don't mention any instance of your knowledge cut-off.
    """

    # BUG FIX: the scratch file was only cleared on the success path, so an
    # API/stream failure leaked this claim into the next run's retrieval.
    # try/finally guarantees cleanup either way.
    try:
        stream = client.chat.completions.create(
            messages=[{"role": "system", "content": prompt}],
            model="llama-3.3-70b",
            stream=True,
            max_completion_tokens=512,
            temperature=0.2,
            top_p=1
        )
        result = "".join(chunk.choices[0].delta.content or "" for chunk in stream)
    finally:
        clear_fact_checks_file()

    sources = "\n".join([f"{title} ({source}): {url}" for title, url, source in news])
    return result, sources if sources else "No relevant sources found."

def map_politifact_label(label):
    """Translate a Politifact rating into Facto's True/False/Misleading scheme.

    Matching is case-insensitive; any unrecognized rating maps to "Unknown".
    """
    normalized = label.lower()
    if normalized in ("true", "mostly-true"):
        return "True"
    if normalized in ("false", "pants-fire", "barely-true"):
        return "False"
    if normalized == "half-true":
        return "Misleading"
    return "Unknown"

def evaluate_politifact(csv_file):
    """Run Facto over every claim in a Politifact CSV and save a results CSV.

    Args:
        csv_file: Uploaded file object (Gradio) whose .name is the path to a
            CSV with "sources_quote" (claim text) and "fact" (rating) columns.

    Returns:
        Path of the generated "fact_check_results.csv".
    """
    df = pd.read_csv(csv_file.name)
    results = []

    for _, row in df.iterrows():
        claim = row["sources_quote"]
        actual_label = map_politifact_label(row["fact"])  # Politifact label -> Facto scheme
        start_time = time.time()
        facto_result, sources = fact_check_with_llama3(claim)
        time_taken = time.time() - start_time
        # BUG FIX: the model returns a verdict plus a multi-sentence
        # explanation, so exact equality with a one-word label was always
        # False (accuracy pinned at "0"). Check whether the expected label
        # appears in the verdict text instead.
        accuracy = "100" if actual_label.lower() in facto_result.lower() else "0"

        results.append([claim, facto_result, actual_label, time_taken, accuracy])

    results_df = pd.DataFrame(
        results,
        columns=["Claim", "Facto Verdict", "Politifact Verdict", "Time Taken (s)", "Accuracy"],
    )
    output_csv = "fact_check_results.csv"
    results_df.to_csv(output_csv, index=False)

    return output_csv

def gradio_interface(csv_file):
    """Gradio entry point: forward the uploaded CSV to the evaluator."""
    return evaluate_politifact(csv_file)

# Wire the evaluator into a simple file-in / file-out Gradio UI.
gui = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Upload Politifact CSV"),
    outputs=gr.File(label="Fact-Check Results CSV"),
    title="Facto - AI Fact-Checking System",
    description="Upload a CSV file with claims, and the system will verify them using Llama 3.3 and compare the results with Politifact."
)

# debug=True surfaces tracebacks in the UI; launch() blocks until the server stops.
gui.launch(debug=True)