Mohammedmarzuk17 committed on
Commit
70fee5e
·
verified ·
1 Parent(s): 7d40d48

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -50
app.py CHANGED
@@ -4,11 +4,17 @@ import requests, feedparser, time, threading, re, json, os
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from concurrent.futures import ThreadPoolExecutor
 
7
 
8
  # ---------------------------
9
- # Load Models
10
  # ---------------------------
 
 
11
 
 
 
 
12
  claim_model_name = "MoritzLaurer/DeBERTa-v3-base-mnli"
13
  claim_classifier = pipeline("zero-shot-classification", model=claim_model_name)
14
  claim_labels = ["factual claim", "opinion", "personal anecdote", "other"]
@@ -22,7 +28,6 @@ nli_pipeline = pipeline("text-classification", model=nli_model_name, tokenizer=n
22
  # ---------------------------
23
  # Evidence Sources
24
  # ---------------------------
25
-
26
  RSS_FEEDS = [
27
  "https://www.snopes.com/feed/",
28
  "https://www.politifact.com/rss/factchecks/",
@@ -38,7 +43,6 @@ CACHE_TTL = 60 * 60 * 3 # 3 hours
38
  # ---------------------------
39
  # Google Fact-Check API Setup
40
  # ---------------------------
41
-
42
  GOOGLE_API_KEY = "AIzaSyAC56onKwR17zd_djUPEfGXQACy9qRjDxw"
43
  GOOGLE_QUERY_LIMIT = 95
44
  COUNTER_FILE = "/tmp/google_fc_counter.json"
@@ -71,28 +75,31 @@ reset_daily_google_counter()
71
  def google_fact_check(claim):
72
  reset_daily_google_counter()
73
  if claim in google_cache:
74
- return google_cache[claim]
75
- if google_counter["count"] >= GOOGLE_QUERY_LIMIT:
76
- return []
77
- try:
78
- url = f"https://factchecktools.googleapis.com/v1alpha1/claims:search?query={claim}&key={GOOGLE_API_KEY}"
79
- resp = requests.get(url, timeout=5)
80
- google_counter["count"] += 1
81
- save_json_cache(COUNTER_FILE, google_counter)
82
- if resp.status_code == 200:
83
- results = resp.json().get("claims", [])
84
- processed = [c.get("text", "")[:250]+"..." if len(c.get("text",""))>250 else c.get("text","") for c in results]
85
- google_cache[claim] = processed
86
- save_json_cache(GOOGLE_CACHE_FILE, google_cache)
87
- return processed
88
- except Exception as e:
89
- print(f"Google Fact-Check API error: {e}")
90
- return []
 
 
 
 
91
 
92
  # ---------------------------
93
  # Helpers
94
  # ---------------------------
95
-
96
  def clean_text(text):
97
  text = re.sub(r'<img.*?>', '', text)
98
  text = re.sub(r'<.*?>', '', text)
@@ -132,9 +139,40 @@ def start_rss_refresher():
132
  t.start()
133
 
134
  # ---------------------------
135
- # Semantic RSS Matching
 
 
 
 
 
 
 
 
 
136
  # ---------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
 
 
 
138
  def match_rss_semantic(claim, top_k=2):
139
  if not RSS_CACHE:
140
  return []
@@ -142,6 +180,9 @@ def match_rss_semantic(claim, top_k=2):
142
  texts = [a["summary"] for a in RSS_CACHE]
143
  titles = [a["title"] for a in RSS_CACHE]
144
 
 
 
 
145
  vectorizer = TfidfVectorizer(stop_words='english')
146
  tfidf_matrix = vectorizer.fit_transform([claim] + texts)
147
  cosine_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
@@ -150,7 +191,7 @@ def match_rss_semantic(claim, top_k=2):
150
  matched = []
151
  matched_titles = []
152
  for i in top_indices:
153
- if cosine_scores[i] > 0.1:
154
  matched.append(texts[i])
155
  matched_titles.append(titles[i])
156
 
@@ -163,31 +204,9 @@ def match_rss_semantic(claim, top_k=2):
163
 
164
  return matched
165
 
166
- # ---------------------------
167
- # Wikipedia Summary
168
- # ---------------------------
169
-
170
- def get_wikipedia_summary(query):
171
- summary = ""
172
- try:
173
- url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{query.replace(' ', '_')}"
174
- resp = requests.get(url, timeout=5)
175
- if resp.status_code == 200:
176
- summary = clean_text(resp.json().get("extract", ""))
177
- except Exception:
178
- pass
179
-
180
- if summary:
181
- print(f"\nClaim: {query}\nWikipedia Summary: {summary[:300]}...")
182
- else:
183
- print(f"\nClaim: {query}\nNo Wikipedia summary found.")
184
-
185
- return summary
186
-
187
  # ---------------------------
188
  # Claim Extraction
189
  # ---------------------------
190
-
191
  def extract_claims(page_text):
192
  sentences = re.split(r'(?<=[.!?;\n])\s+', page_text) if page_text else []
193
  results, seen = [], set()
@@ -207,7 +226,6 @@ def extract_claims(page_text):
207
  # ---------------------------
208
  # AI Detection
209
  # ---------------------------
210
-
211
  def detect_ai(texts):
212
  if isinstance(texts, str):
213
  texts = [texts]
@@ -222,7 +240,6 @@ def detect_ai(texts):
222
  # ---------------------------
223
  # Fact-Checking with Threaded NLI + Google
224
  # ---------------------------
225
-
226
  def process_evidence_pair(claim, evidence):
227
  key = f"{claim}||{evidence}"
228
  if key in nli_cache:
@@ -292,7 +309,6 @@ def fact_check_with_sources(claims):
292
  # ---------------------------
293
  # Predict
294
  # ---------------------------
295
-
296
  def predict(page_text=""):
297
  claims = extract_claims(page_text)
298
  ai_results = detect_ai(claims) if claims else []
@@ -306,7 +322,6 @@ def predict(page_text=""):
306
  # ---------------------------
307
  # Gradio UI
308
  # ---------------------------
309
-
310
  with gr.Blocks() as demo:
311
  gr.Markdown("## EduShield AI Backend - Predict API & UI")
312
  with gr.Tab("Predict"):
@@ -328,7 +343,6 @@ with gr.Blocks() as demo:
328
  # ---------------------------
329
  # Launch
330
  # ---------------------------
331
-
332
  if __name__ == "__main__":
333
  refresh_rss_cache(force=True)
334
  start_rss_refresher()
 
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from concurrent.futures import ThreadPoolExecutor
7
+ import nltk
8
 
9
  # ---------------------------
10
+ # NLTK setup for keyword extraction
11
  # ---------------------------
12
+ nltk.download('punkt')
13
+ nltk.download('averaged_perceptron_tagger')
14
 
15
+ # ---------------------------
16
+ # Load Models
17
+ # ---------------------------
18
  claim_model_name = "MoritzLaurer/DeBERTa-v3-base-mnli"
19
  claim_classifier = pipeline("zero-shot-classification", model=claim_model_name)
20
  claim_labels = ["factual claim", "opinion", "personal anecdote", "other"]
 
28
  # ---------------------------
29
  # Evidence Sources
30
  # ---------------------------
 
31
  RSS_FEEDS = [
32
  "https://www.snopes.com/feed/",
33
  "https://www.politifact.com/rss/factchecks/",
 
43
  # ---------------------------
44
  # Google Fact-Check API Setup
45
  # ---------------------------
 
46
# SECURITY: a live Google API key was previously hardcoded (and committed)
# here. That key must be revoked; supply a fresh one via the GOOGLE_API_KEY
# environment variable instead of embedding it in source control.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
GOOGLE_QUERY_LIMIT = 95  # self-imposed daily request budget (free tier)
COUNTER_FILE = "/tmp/google_fc_counter.json"  # persisted daily request counter
 
75
def google_fact_check(claim):
    """Look up *claim* against the Google Fact Check Tools API.

    Successful lookups are memoised in ``google_cache`` (persisted via
    ``save_json_cache``) and each outbound request increments the daily
    counter so ``GOOGLE_QUERY_LIMIT`` is not exceeded.

    Returns a list of claim-text snippets truncated to 250 characters, or
    an empty list when the quota is exhausted or the API call fails.
    """
    reset_daily_google_counter()

    if claim in google_cache:
        hits = google_cache[claim]
    elif google_counter["count"] >= GOOGLE_QUERY_LIMIT:
        # Quota exhausted: return nothing, but do NOT cache the empty
        # result — caching it would keep this claim empty forever, even
        # after the daily counter resets.
        hits = []
    else:
        hits = []
        try:
            # Pass the claim via `params` so requests URL-encodes it;
            # interpolating the raw text into the URL breaks on
            # spaces, '&', '#', etc.
            resp = requests.get(
                "https://factchecktools.googleapis.com/v1alpha1/claims:search",
                params={"query": claim, "key": GOOGLE_API_KEY},
                timeout=5,
            )
            google_counter["count"] += 1
            save_json_cache(COUNTER_FILE, google_counter)
            if resp.status_code == 200:
                results = resp.json().get("claims", [])
                hits = [
                    text[:250] + "..." if len(text) > 250 else text
                    for text in (c.get("text", "") for c in results)
                ]
                # Only persist results that came from a successful
                # response — errors and quota misses stay uncached.
                google_cache[claim] = hits
                save_json_cache(GOOGLE_CACHE_FILE, google_cache)
        except Exception as e:
            print(f"Google Fact-Check API error: {e}")

    print(f"\nClaim: {claim}\nGoogle Fact-Check Hits: {hits if hits else 'None'}")
    return hits
99
 
100
  # ---------------------------
101
  # Helpers
102
  # ---------------------------
 
103
  def clean_text(text):
104
  text = re.sub(r'<img.*?>', '', text)
105
  text = re.sub(r'<.*?>', '', text)
 
139
  t.start()
140
 
141
  # ---------------------------
142
+ # Keyword Extraction
143
+ # ---------------------------
144
def extract_keywords(sentence):
    """Return up to five noun/adjective tokens from *sentence*.

    Falls back to the first five raw tokens when POS tagging yields
    no nouns or adjectives at all.
    """
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    picked = []
    for token, tag in tagged:
        # NN* = nouns, JJ* = adjectives — the content-bearing words.
        if tag.startswith('NN') or tag.startswith('JJ'):
            picked.append(token)
    if not picked:
        picked = tokens
    return picked[:5]
149
+
150
+ # ---------------------------
151
+ # Wikipedia Summary
152
  # ---------------------------
153
def get_wikipedia_summary(query):
    """Fetch a short Wikipedia summary for *query*.

    Tries a keyword-derived page title first, then the raw query with
    spaces replaced by underscores. Returns "" when no page matches or
    every request fails.
    """
    import urllib.parse  # local import: only needed here, file header unchanged

    summary = ""
    keywords = extract_keywords(query)
    # Deduplicate while preserving order so identical variants don't
    # trigger a second, redundant API request.
    variants = list(dict.fromkeys(['_'.join(keywords), query.replace(' ', '_')]))
    for variant in variants:
        try:
            # Percent-encode the title: raw interpolation breaks on
            # '?', '#', '&', '%' and similar characters.
            url = ("https://en.wikipedia.org/api/rest_v1/page/summary/"
                   + urllib.parse.quote(variant, safe=''))
            resp = requests.get(url, timeout=5)
            if resp.status_code == 200:
                summary = clean_text(resp.json().get("extract", ""))
                if summary:
                    break
        except Exception:
            # Best-effort lookup: a failed variant just falls through
            # to the next candidate title.
            continue

    if summary:
        print(f"\nClaim: {query}\nWikipedia Summary: {summary[:300]}...")
    else:
        print(f"\nClaim: {query}\nNo Wikipedia summary found.")
    return summary
172
 
173
+ # ---------------------------
174
+ # RSS Semantic + Keyword Matching
175
+ # ---------------------------
176
  def match_rss_semantic(claim, top_k=2):
177
  if not RSS_CACHE:
178
  return []
 
180
  texts = [a["summary"] for a in RSS_CACHE]
181
  titles = [a["title"] for a in RSS_CACHE]
182
 
183
+ claim_keywords = extract_keywords(claim)
184
+ keyword_pattern = '|'.join(claim_keywords).lower()
185
+
186
  vectorizer = TfidfVectorizer(stop_words='english')
187
  tfidf_matrix = vectorizer.fit_transform([claim] + texts)
188
  cosine_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
 
191
  matched = []
192
  matched_titles = []
193
  for i in top_indices:
194
+ if cosine_scores[i] > 0.1 or any(k in texts[i].lower() for k in claim_keywords):
195
  matched.append(texts[i])
196
  matched_titles.append(titles[i])
197
 
 
204
 
205
  return matched
206
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  # ---------------------------
208
  # Claim Extraction
209
  # ---------------------------
 
210
  def extract_claims(page_text):
211
  sentences = re.split(r'(?<=[.!?;\n])\s+', page_text) if page_text else []
212
  results, seen = [], set()
 
226
  # ---------------------------
227
  # AI Detection
228
  # ---------------------------
 
229
  def detect_ai(texts):
230
  if isinstance(texts, str):
231
  texts = [texts]
 
240
  # ---------------------------
241
  # Fact-Checking with Threaded NLI + Google
242
  # ---------------------------
 
243
  def process_evidence_pair(claim, evidence):
244
  key = f"{claim}||{evidence}"
245
  if key in nli_cache:
 
309
  # ---------------------------
310
  # Predict
311
  # ---------------------------
 
312
  def predict(page_text=""):
313
  claims = extract_claims(page_text)
314
  ai_results = detect_ai(claims) if claims else []
 
322
  # ---------------------------
323
  # Gradio UI
324
  # ---------------------------
 
325
  with gr.Blocks() as demo:
326
  gr.Markdown("## EduShield AI Backend - Predict API & UI")
327
  with gr.Tab("Predict"):
 
343
  # ---------------------------
344
  # Launch
345
  # ---------------------------
 
346
  if __name__ == "__main__":
347
  refresh_rss_cache(force=True)
348
  start_rss_refresher()