Francisco Zanartu committed on
Commit ·
81b8253
1
Parent(s): 2c74e8f
feat: implement Gradio interface for misinformation detection and enhance document analysis functionality
Browse files- .gitignore +8 -0
- main.py +144 -3
- pyproject.toml +1 -0
.gitignore
CHANGED
|
@@ -8,3 +8,11 @@ wheels/
|
|
| 8 |
|
| 9 |
# Virtual environments
|
| 10 |
.venv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
# Virtual environments
|
| 10 |
.venv
|
| 11 |
+
service-account.json
|
| 12 |
+
*.ipynb
|
| 13 |
+
.ipynb_checkpoints/
|
| 14 |
+
data/
|
| 15 |
+
notebooks/
|
| 16 |
+
.env
|
| 17 |
+
.continue/
|
| 18 |
+
.vscode/
|
main.py
CHANGED
|
@@ -1,6 +1,147 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
if __name__ == "__main__":
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Simplified Gradio interface for misinformation detection.
|
| 3 |
+
This is the minimal version for quick prototyping.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
import pprint
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import gradio as gr
|
| 10 |
+
from src.utils.parser import MarkdownConverter
|
| 11 |
+
from src.utils.chunking import get_base_chunks
|
| 12 |
+
from src.api.apis import classify_text
|
| 13 |
+
from src.api.rebuttal import RebuttalStructure
|
| 14 |
+
|
| 15 |
+
# Configure root logging once at import time; the module-level logger below
# (and any loggers created elsewhere in the package) inherit this format
# and the INFO threshold.
_LOG_FORMAT = "%(asctime)s %(levelname)s %(message)s"
_LOG_DATEFMT = "%m/%d/%Y %I:%M:%S %p"
logging.basicConfig(format=_LOG_FORMAT, datefmt=_LOG_DATEFMT, level=logging.INFO)
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _extract_chunk_text(chunk):
    """Return plain text from a single chunk.

    Chunks may be LangChain ``Document`` objects (text under
    ``page_content``), dicts (text under ``metadata.chunk`` or ``text``),
    or anything else (stringified as a fallback).
    """
    if hasattr(chunk, "page_content"):
        # LangChain Document object
        return chunk.page_content
    if isinstance(chunk, dict):
        # Dictionary format
        return chunk.get("metadata", {}).get("chunk") or chunk.get("text", "")
    # Fallback: try to convert to string
    return str(chunk)


def analyze_document(pdf_file, url_text):
    """
    Simple analysis function that processes PDF or URL input.

    Args:
        pdf_file: Filesystem path of an uploaded PDF, or ``None``.
        url_text: URL to fetch, or ``None``/empty string.

    Returns:
        Formatted HTML with highlighted misinformation and rebuttals, an
        all-clear banner when nothing is detected, or an error message
        when neither input was provided.
    """
    import html  # stdlib; escape untrusted document text before embedding in HTML

    # Step 1: Convert to markdown
    md = MarkdownConverter()

    if pdf_file is not None:
        logger.info("π Processing PDF file: %s", pdf_file)
        document = md.run(pdf_file)
        source = Path(pdf_file).name
    elif url_text and url_text.strip():
        logger.info("π Processing URL: %s", url_text)
        document = md.run(url_text)
        source = url_text
    else:
        return "<p style='color: red;'>β Please provide a PDF or URL</p>"

    # Step 2: Chunk the document
    chunks = get_base_chunks(document, chunk_size=1000, chunk_overlap=200)
    logger.info("β Created %d chunks", len(chunks))
    logger.info("β First chunk: %s", pprint.pformat(chunks[0], indent=4))

    # Step 3: Classify chunks.
    # BUG FIX: keep `responses` aligned one-to-one with `chunks` (None for
    # chunks with no extractable text). The previous version skipped empty
    # chunks entirely, which shifted every later entry and caused
    # `responses[i]` below to pair classifications — and therefore
    # rebuttals — with the wrong document sections.
    responses = []
    for chunk in chunks:
        chunk_text = _extract_chunk_text(chunk)
        responses.append(classify_text(chunk_text) if chunk_text else None)

    # Step 4: Keep only positive misinformation detections
    # (category "0" means "no misinformation"; None means "empty chunk").
    positive_responses = [
        r for r in responses if r is not None and r.category != "0"
    ]

    # If no misinformation found
    if not positive_responses:
        return f"""
        <div style='padding: 30px; background: #d4edda; border-radius: 8px; text-align: center;'>
            <h2>β No Misinformation Detected</h2>
            <p>Source: {html.escape(str(source))}</p>
        </div>
        """

    # Step 5: Generate rebuttals — same order as positive_responses, so the
    # running counter below pairs each flagged chunk with its rebuttal.
    rebuttal_gen = RebuttalStructure()
    rebuttals = [rebuttal_gen.run(misinfo) for misinfo in positive_responses]

    # Step 6: Build output HTML.
    # SECURITY: document text comes from an untrusted PDF or URL and is
    # injected into HTML, so it is escaped — raw '<'/'>' would otherwise be
    # rendered (or executed) as markup.
    output = f"""
    <div style='max-width: 900px; margin: 0 auto; padding: 20px; font-family: Arial, sans-serif;'>
        <h2>π Misinformation Analysis</h2>
        <p><strong>Source:</strong> {html.escape(str(source))}</p>
        <p><strong>Issues Found:</strong> {len(positive_responses)} out of {len(chunks)} sections</p>
        <hr>
    """

    # Map responses to chunks (zip is safe: lists are the same length).
    response_index = 0
    for chunk, response in zip(chunks, responses):
        chunk_text = html.escape(_extract_chunk_text(chunk))

        # Check if this chunk has misinformation
        if response is not None and response.category != "0":
            # Highlighted section with rebuttal
            output += f"""
            <div style='background: #fff3cd; padding: 20px; margin: 20px 0; border-left: 4px solid #ff9800; border-radius: 4px;'>
                <p style='margin: 0; font-size: 16px;'>{chunk_text}</p>
                <div style='margin-top: 15px; padding: 15px; background: white; border-left: 3px solid #2196F3; border-radius: 4px;'>
                    <strong style='color: #2196F3;'>π Fact Check:</strong>
                    <p style='margin: 5px 0 0 0;'>{rebuttals[response_index]}</p>
                </div>
            </div>
            """
            response_index += 1
        else:
            # Normal section
            output += f"""
            <div style='padding: 15px; margin: 15px 0; background: #f5f5f5; border-radius: 4px;'>
                <p style='margin: 0;'>{chunk_text}</p>
            </div>
            """

    output += "</div>"
    return output
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Create simple Gradio interface
|
| 128 |
+
# Create simple Gradio interface: either input is optional, but
# analyze_document requires at least one of them.
_input_components = [
    gr.File(label="π Upload PDF (optional)", file_types=[".pdf"]),
    gr.Textbox(
        label="π Or Enter URL (optional)",
        placeholder="https://example.com/article",
    ),
]

demo = gr.Interface(
    fn=analyze_document,
    inputs=_input_components,
    outputs=gr.HTML(label="Analysis Results"),
    title="π Misinformation Detector",
    description="Upload a PDF or enter a URL to detect and fact-check potential misinformation.",
)
|
| 141 |
|
| 142 |
|
| 143 |
if __name__ == "__main__":
    # Bind to loopback explicitly: 0.0.0.0 is not reachable from Safari.
    demo.launch(server_name="127.0.0.1", server_port=7860)
|
pyproject.toml
CHANGED
|
@@ -13,6 +13,7 @@ dependencies = [
|
|
| 13 |
"langchain-google-genai>=4.1.2",
|
| 14 |
"langchain-openai>=1.1.6",
|
| 15 |
"langchain-text-splitters>=1.1.0",
|
|
|
|
| 16 |
"pip>=25.3",
|
| 17 |
"python-dotenv>=1.2.1",
|
| 18 |
"requests>=2.32.5",
|
|
|
|
| 13 |
"langchain-google-genai>=4.1.2",
|
| 14 |
"langchain-openai>=1.1.6",
|
| 15 |
"langchain-text-splitters>=1.1.0",
|
| 16 |
+
"markdown>=3.10",
|
| 17 |
"pip>=25.3",
|
| 18 |
"python-dotenv>=1.2.1",
|
| 19 |
"requests>=2.32.5",
|