Francisco Zanartu committed on
Commit
81b8253
·
1 Parent(s): 2c74e8f

feat: implement Gradio interface for misinformation detection and enhance document analysis functionality

Browse files
Files changed (3) hide show
  1. .gitignore +8 -0
  2. main.py +144 -3
  3. pyproject.toml +1 -0
.gitignore CHANGED
@@ -8,3 +8,11 @@ wheels/
8
 
9
  # Virtual environments
10
  .venv
 
 
 
 
 
 
 
 
 
8
 
9
  # Virtual environments
10
  .venv
11
+ service-account.json
12
+ *.ipynb
13
+ .ipynb_checkpoints/
14
+ data/
15
+ notebooks/
16
+ .env
17
+ .continue/
18
+ .vscode/
main.py CHANGED
@@ -1,6 +1,147 @@
1
- def main():
2
- print("Hello from fhoc!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
5
  if __name__ == "__main__":
6
- main()
 
 
 
 
1
+ """
2
+ Simplified Gradio interface for misinformation detection.
3
+ This is the minimal version for quick prototyping.
4
+ """
5
+
6
+ import logging
7
+ import pprint
8
+ from pathlib import Path
9
+ import gradio as gr
10
+ from src.utils.parser import MarkdownConverter
11
+ from src.utils.chunking import get_base_chunks
12
+ from src.api.apis import classify_text
13
+ from src.api.rebuttal import RebuttalStructure
14
+
15
# Module-wide logging: timestamped, INFO-level messages for the whole app.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
    level=logging.INFO,
)
# Module-level logger, per stdlib convention (named after this module).
logger = logging.getLogger(__name__)
21
+
22
+
23
def _extract_chunk_text(chunk):
    """Return the raw text of a chunk.

    Accepts a LangChain ``Document`` (``page_content``), a dict (as produced
    by some chunkers, with the text under ``metadata.chunk`` or ``text``),
    or anything else via best-effort ``str()`` conversion.
    """
    if hasattr(chunk, "page_content"):
        # LangChain Document object
        return chunk.page_content
    if isinstance(chunk, dict):
        # Dictionary format
        return chunk.get("metadata", {}).get("chunk") or chunk.get("text", "")
    # Fallback: try to convert to string
    return str(chunk)


def analyze_document(pdf_file, url_text):
    """Analyze a PDF file or a URL for misinformation.

    Converts the input to markdown, chunks it, classifies each chunk, and
    renders an HTML report with flagged sections highlighted alongside
    generated rebuttals.

    Args:
        pdf_file: Path of an uploaded PDF file, or ``None``.
        url_text: URL string to fetch, or empty/``None``.

    Returns:
        An HTML string with the analysis results, or an error panel when
        neither input is provided.
    """
    import html  # stdlib; escapes untrusted document text before HTML embedding

    # Step 1: Convert the input to markdown.
    md = MarkdownConverter()

    if pdf_file is not None:
        logger.info("📄 Processing PDF file: %s", pdf_file)
        document = md.run(pdf_file)
        source = Path(pdf_file).name
    elif url_text and url_text.strip():
        logger.info("🔗 Processing URL: %s", url_text)
        document = md.run(url_text)
        source = url_text
    else:
        return "<p style='color: red;'>❌ Please provide a PDF or URL</p>"

    # Step 2: Chunk the document.
    chunks = get_base_chunks(document, chunk_size=1000, chunk_overlap=200)
    logger.info("✅ Created %d chunks", len(chunks))
    if chunks:  # guard: pformat(chunks[0]) would raise IndexError on an empty document
        logger.info("✅ First chunk: %s", pprint.pformat(chunks[0], indent=4))

    # Step 3: Classify chunks.
    # Keep `responses` aligned 1:1 with `chunks` (None for chunks with no
    # text) so the rendering loop below can pair them by position without
    # drifting when a chunk yields empty text.
    responses = []
    for chunk in chunks:
        chunk_text = _extract_chunk_text(chunk)
        responses.append(classify_text(chunk_text) if chunk_text else None)

    # Step 4: Keep only positive misinformation detections.
    positive_responses = [r for r in responses if r is not None and r.category != "0"]

    # If no misinformation was found, short-circuit with a success panel.
    if not positive_responses:
        return f"""
        <div style='padding: 30px; background: #d4edda; border-radius: 8px; text-align: center;'>
            <h2>✅ No Misinformation Detected</h2>
            <p>Source: {html.escape(str(source))}</p>
        </div>
        """

    # Step 5: Generate rebuttals — one per flagged chunk, in document order,
    # so `rebuttals[k]` matches the k-th flagged chunk encountered below.
    rebuttal_gen = RebuttalStructure()
    rebuttals = [rebuttal_gen.run(misinfo) for misinfo in positive_responses]

    # Step 6: Build the output HTML.
    output = f"""
    <div style='max-width: 900px; margin: 0 auto; padding: 20px; font-family: Arial, sans-serif;'>
        <h2>📋 Misinformation Analysis</h2>
        <p><strong>Source:</strong> {html.escape(str(source))}</p>
        <p><strong>Issues Found:</strong> {len(positive_responses)} out of {len(chunks)} sections</p>
        <hr>
    """

    response_index = 0
    for chunk, response in zip(chunks, responses):
        # Escape document text: it is untrusted external input embedded in HTML.
        safe_text = html.escape(_extract_chunk_text(chunk))

        if response is not None and response.category != "0":
            # Flagged section, highlighted with its rebuttal.
            # NOTE(review): the rebuttal is inserted unescaped on the assumption
            # that RebuttalStructure.run returns trusted text/markup — confirm.
            output += f"""
            <div style='background: #fff3cd; padding: 20px; margin: 20px 0; border-left: 4px solid #ff9800; border-radius: 4px;'>
                <p style='margin: 0; font-size: 16px;'>{safe_text}</p>
                <div style='margin-top: 15px; padding: 15px; background: white; border-left: 3px solid #2196F3; border-radius: 4px;'>
                    <strong style='color: #2196F3;'>🔍 Fact Check:</strong>
                    <p style='margin: 5px 0 0 0;'>{rebuttals[response_index]}</p>
                </div>
            </div>
            """
            response_index += 1
        else:
            # Normal (unflagged or empty) section.
            output += f"""
            <div style='padding: 15px; margin: 15px 0; background: #f5f5f5; border-radius: 4px;'>
                <p style='margin: 0;'>{safe_text}</p>
            </div>
            """

    output += "</div>"
    return output
125
+
126
+
127
# Simple Gradio interface: optional PDF upload and/or URL text box in,
# rendered HTML analysis report out. (Fix: UI label/title strings carried
# mis-encoded emoji; restored to proper UTF-8.)
demo = gr.Interface(
    fn=analyze_document,
    inputs=[
        gr.File(label="📄 Upload PDF (optional)", file_types=[".pdf"]),
        gr.Textbox(
            label="🔗 Or Enter URL (optional)",
            placeholder="https://example.com/article",
        ),
    ],
    outputs=gr.HTML(label="Analysis Results"),
    title="🔍 Misinformation Detector",
    description="Upload a PDF or enter a URL to detect and fact-check potential misinformation.",
)
141
 
142
 
143
if __name__ == "__main__":
    # Launch the Gradio app on the local loopback interface only.
    demo.launch(
        server_name="127.0.0.1",  # Use 127.0.0.1 instead of 0.0.0.0 for Safari
        server_port=7860,
    )
pyproject.toml CHANGED
@@ -13,6 +13,7 @@ dependencies = [
13
  "langchain-google-genai>=4.1.2",
14
  "langchain-openai>=1.1.6",
15
  "langchain-text-splitters>=1.1.0",
 
16
  "pip>=25.3",
17
  "python-dotenv>=1.2.1",
18
  "requests>=2.32.5",
 
13
  "langchain-google-genai>=4.1.2",
14
  "langchain-openai>=1.1.6",
15
  "langchain-text-splitters>=1.1.0",
16
+ "markdown>=3.10",
17
  "pip>=25.3",
18
  "python-dotenv>=1.2.1",
19
  "requests>=2.32.5",