Mohammedmarzuk17 committed on
Commit
5e7b159
·
verified ·
1 Parent(s): d043e29

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -41
app.py CHANGED
@@ -6,10 +6,12 @@ from concurrent.futures import ThreadPoolExecutor
6
  # ---------------------------
7
  # Load Models
8
  # ---------------------------
 
9
  claim_model_name = "MoritzLaurer/DeBERTa-v3-base-mnli"
10
  claim_classifier = pipeline("zero-shot-classification", model=claim_model_name, device=-1)
11
  claim_labels = ["factual claim", "opinion", "personal anecdote", "other"]
12
 
 
13
  ai_detect_model_name = "roberta-base-openai-detector"
14
  ai_detector = pipeline("text-classification", model=ai_detect_model_name, device=-1)
15
 
@@ -23,67 +25,66 @@ google_quota = {"count": 0, "date": datetime.date.today()}
23
  GOOGLE_DAILY_LIMIT = 100
24
 
25
  # ---------------------------
26
- # Text Splitting (Sentence-based)
27
  # ---------------------------
28
- def split_sentences(text):
29
- """Split text by period into complete sentences."""
30
- sentences = [s.strip() for s in text.split('.') if s.strip()]
31
- return [s + '.' for s in sentences]
 
 
 
32
 
33
  # ---------------------------
34
  # Claim Extraction
35
  # ---------------------------
36
- def extract_claims(page_text, max_claims=20):
37
  """
38
- Extract sentences as claims (not filtered).
39
- - Each sentence from user input is treated as a claim.
40
- - Classify each sentence with DeBERTa model for factuality.
41
  """
42
- sentences = split_sentences(page_text)
43
 
 
44
  def classify_sentence(s):
45
- try:
46
- out = claim_classifier(s, claim_labels)
47
- label_priority = ["factual claim", "opinion", "personal anecdote"]
48
- for lbl in label_priority:
49
- if lbl in out["labels"]:
50
- return {"text": s, "label": lbl, "score": round(out["scores"][out["labels"].index(lbl)], 3)}
51
- except Exception:
52
- pass
53
- return {"text": s, "label": "unknown", "score": 0.0}
54
-
55
  results = []
56
  with ThreadPoolExecutor() as executor:
57
  for r in executor.map(classify_sentence, sentences):
58
- results.append(r)
 
59
 
60
- # Keep full sentences, limit only if too long
61
- results = results[:max_claims]
62
  return results
63
 
64
  # ---------------------------
65
  # AI Text Detection
66
  # ---------------------------
67
  def detect_ai(texts):
68
- """Detect AI-generated or human-written content."""
69
  if isinstance(texts, str):
70
  texts = [texts]
71
  results = []
72
  for t in texts:
73
- try:
74
- out = ai_detector(t)
75
- raw_label = out[0]["label"]
76
- label = "AI-generated" if raw_label.lower() in ["fake", "ai-generated"] else "Human"
77
- results.append({"text": t, "label": label, "score": round(out[0]["score"], 3)})
78
- except Exception:
79
- results.append({"text": t, "label": "error", "score": 0.0})
80
  return results
81
 
82
  # ---------------------------
83
- # Google Search Fact Checking
84
  # ---------------------------
85
  def fetch_google_search(claim):
86
- """Fetch top 3 Google results for a claim."""
87
  global google_quota
88
  today = datetime.date.today()
89
  if google_quota["date"] != today:
@@ -97,7 +98,7 @@ def fetch_google_search(claim):
97
  r = requests.get(url).json()
98
  google_quota["count"] += 1
99
  items = r.get("items", [])
100
- return [f"{item['title']}: {item['snippet']}" for item in items[:3]]
101
  except Exception:
102
  return []
103
 
@@ -107,19 +108,20 @@ def fetch_google_search(claim):
107
  def predict(user_text=""):
108
  """
109
  Runs both:
110
- 1. Full-text analysis (AI detection + single fact-check)
111
- 2. Claim-based analysis (sentence split + AI detection + fact-check)
112
  """
113
  if not user_text.strip():
114
  return {"error": "No text provided."}
115
 
116
- # --- Full Text Analysis ---
117
  full_ai_result = detect_ai(user_text)
118
 
119
- # ✅ FIX: Now fact-check entire input once
120
- full_fact_checking = {"Full text": fetch_google_search(user_text)}
 
121
 
122
- # --- Claim-based Analysis ---
123
  claims_data = extract_claims(user_text)
124
  claims_texts = [c["text"] for c in claims_data]
125
  claims_ai_results = detect_ai(claims_texts) if claims_texts else []
@@ -129,7 +131,7 @@ def predict(user_text=""):
129
  "full_text": {
130
  "input": user_text,
131
  "ai_detection": full_ai_result,
132
- "fact_checking": full_fact_checking # full input fact-checked once
133
  },
134
  "claims": claims_data,
135
  "claims_ai_detection": claims_ai_results,
 
6
  # ---------------------------
7
  # Load Models
8
  # ---------------------------
9
+ # Claim Extraction → Zero-Shot Classifier (DeBERTa MNLI)
10
  claim_model_name = "MoritzLaurer/DeBERTa-v3-base-mnli"
11
  claim_classifier = pipeline("zero-shot-classification", model=claim_model_name, device=-1)
12
  claim_labels = ["factual claim", "opinion", "personal anecdote", "other"]
13
 
14
+ # AI Text Detection → OpenAI Detector (Roberta-based)
15
  ai_detect_model_name = "roberta-base-openai-detector"
16
  ai_detector = pipeline("text-classification", model=ai_detect_model_name, device=-1)
17
 
 
25
  GOOGLE_DAILY_LIMIT = 100
26
 
27
  # ---------------------------
28
+ # Safe Split Helpers
29
  # ---------------------------
30
def safe_split_text(text):
    """Split text into sentence-like fragments on '.', ',' and ';'.

    A '.' or ',' that is immediately followed by a digit is treated as part
    of a number (e.g. 1.41, 1,200, $1,200) and is not a split point. Only the
    following character is inspected, so a sentence that ends in a number
    ("... founded in 1999. It grew ...") is still split correctly.

    Args:
        text: Raw input text. Empty or None input yields an empty list.

    Returns:
        A list of stripped fragments, in their original order, keeping only
        fragments with more than four whitespace-separated words.
    """
    if not text:
        return []
    # Looking only at what follows the mark keeps "1.41" and "1,200" intact
    # while still breaking "1999. It" and "2020, the".
    pattern = r'[.,](?!\d)|;'
    fragments = (piece.strip() for piece in re.split(pattern, text))
    # Very short fragments are rarely self-contained claims, so drop them.
    return [piece for piece in fragments if len(piece.split()) > 4]
37
 
38
  # ---------------------------
39
  # Claim Extraction
40
  # ---------------------------
41
def extract_claims(page_text, max_claims=20, batch_size=50):
    """Split text into fragments and classify each one as a claim type.

    Fragments come from ``safe_split_text``. Each fragment is scored by the
    zero-shot ``claim_classifier`` against ``claim_labels`` and tagged with
    its highest-scoring label. Fragments whose best label is not one of
    "factual claim", "opinion" or "personal anecdote" (i.e. "other") are
    dropped.

    Args:
        page_text: Raw text to analyse.
        max_claims: Maximum number of results to return.
        batch_size: Currently unused; kept so existing callers that pass it
            keep working.

    Returns:
        A list of at most ``max_claims`` dicts with keys "text", "label" and
        "score" (rounded to 3 decimals), ordered from longest to shortest
        fragment text.
    """
    sentences = safe_split_text(page_text)
    if not sentences:
        return []

    reportable_labels = ("factual claim", "opinion", "personal anecdote")

    def classify_sentence(s):
        out = claim_classifier(s, claim_labels)
        # The classifier output lists every candidate label, so a plain
        # membership test would always match the first priority label.
        # Select the label with the highest score instead.
        best_label, best_score = max(
            zip(out["labels"], out["scores"]), key=lambda pair: pair[1]
        )
        if best_label not in reportable_labels:
            return None
        return {"text": s, "label": best_label, "score": round(best_score, 3)}

    # executor.map preserves input order; None marks a discarded fragment.
    with ThreadPoolExecutor() as executor:
        results = [r for r in executor.map(classify_sentence, sentences) if r]

    # Longest fragments first (stable sort), then cap the list.
    results.sort(key=lambda item: -len(item["text"]))
    return results[:max_claims]
68
 
69
  # ---------------------------
70
  # AI Text Detection
71
  # ---------------------------
72
  def detect_ai(texts):
73
+ """Detect whether input text is AI-generated or human-written."""
74
  if isinstance(texts, str):
75
  texts = [texts]
76
  results = []
77
  for t in texts:
78
+ out = ai_detector(t)
79
+ raw_label = out[0]["label"]
80
+ label = "AI-generated" if raw_label.lower() in ["fake", "ai-generated"] else "Human"
81
+ results.append({"text": t, "label": label, "score": round(out[0]["score"], 3)})
 
 
 
82
  return results
83
 
84
  # ---------------------------
85
+ # Google Evidence Gathering
86
  # ---------------------------
87
  def fetch_google_search(claim):
 
88
  global google_quota
89
  today = datetime.date.today()
90
  if google_quota["date"] != today:
 
98
  r = requests.get(url).json()
99
  google_quota["count"] += 1
100
  items = r.get("items", [])
101
+ return [f"{item['title']}: {item['snippet']}" for item in items[:3]] # top 3 results
102
  except Exception:
103
  return []
104
 
 
108
  def predict(user_text=""):
109
  """
110
  Runs both:
111
+ 1. Full-text analysis (AI detection on entire input + sentence-based fact-check)
112
+ 2. Claim-extracted analysis (claim split + AI detection + fact-check)
113
  """
114
  if not user_text.strip():
115
  return {"error": "No text provided."}
116
 
117
+ # --- Full text analysis ---
118
  full_ai_result = detect_ai(user_text)
119
 
120
+ # NEW: Split strictly by '.' to preserve full user input sentences
121
+ dot_sentences = [s.strip() for s in user_text.split('.') if s.strip()]
122
+ full_fact_checking = {s: fetch_google_search(s) for s in dot_sentences}
123
 
124
+ # --- Claim-based analysis ---
125
  claims_data = extract_claims(user_text)
126
  claims_texts = [c["text"] for c in claims_data]
127
  claims_ai_results = detect_ai(claims_texts) if claims_texts else []
 
131
  "full_text": {
132
  "input": user_text,
133
  "ai_detection": full_ai_result,
134
+ "fact_checking": full_fact_checking
135
  },
136
  "claims": claims_data,
137
  "claims_ai_detection": claims_ai_results,