Mohammedmarzuk17 commited on
Commit
0619122
·
verified ·
1 Parent(s): 71be51b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -28
app.py CHANGED
@@ -25,28 +25,28 @@ google_quota = {"count": 0, "date": datetime.date.today()}
25
  GOOGLE_DAILY_LIMIT = 100
26
 
27
  # ---------------------------
28
- # Claim Extraction (Skip splitting numeric/money commas)
 
 
 
 
 
 
 
 
 
 
 
29
  # ---------------------------
30
  def extract_claims(page_text, max_claims=20, batch_size=50):
31
  """
32
  Extract top claims from text:
33
- - Split on '.' first, then split on ',' and ';' but skip numeric/money commas.
34
- - Use zero-shot classification to get factual claim, opinion, or personal anecdote.
35
  """
36
- # Step 1: Split text on '.'
37
- sentences = [s.strip() for s in page_text.split('.') if len(s.strip().split()) > 4]
38
-
39
- # Step 2: Function to safely split a sentence on ',' and ';'
40
- def safe_split(s):
41
- pattern = r'(?<![\d\$]),|;' # avoid commas in numbers like 1,000
42
- chunks = re.split(pattern, s)
43
- return [c.strip() for c in chunks if len(c.split()) > 4]
44
 
45
- refined_sentences = []
46
- for s in sentences:
47
- refined_sentences.extend(safe_split(s))
48
-
49
- # Step 3: Function to classify a single sentence
50
  def classify_sentence(s):
51
  out = claim_classifier(s, claim_labels)
52
  label_priority = ["factual claim", "opinion", "personal anecdote"]
@@ -55,16 +55,15 @@ def extract_claims(page_text, max_claims=20, batch_size=50):
55
  return {"text": s, "label": lbl, "score": round(out["scores"][out["labels"].index(lbl)], 3)}
56
  return None
57
 
58
- # Step 4: Threaded classification
59
  results = []
60
  with ThreadPoolExecutor() as executor:
61
- for r in executor.map(classify_sentence, refined_sentences):
62
  if r:
63
  results.append(r)
64
 
65
- # Step 5: Limit top claims
66
  results = sorted(results, key=lambda x: -len(x["text"]))[:max_claims]
67
-
68
  return results
69
 
70
  # ---------------------------
@@ -103,19 +102,13 @@ def fetch_google_search(claim):
103
  except Exception:
104
  return []
105
 
106
- # ---------------------------
107
- # Dot-split helper for raw text
108
- # ---------------------------
109
- def split_on_dots(text):
110
- return [s.strip() for s in text.split('.') if len(s.strip().split()) > 4]
111
-
112
  # ---------------------------
113
  # Unified Predict Function
114
  # ---------------------------
115
  def predict(user_text=""):
116
  """
117
  Runs both:
118
- 1. Full-text analysis (AI detection on entire input + dot-split fact-check)
119
  2. Claim-extracted analysis (claim split + AI detection + fact-check)
120
  """
121
  if not user_text.strip():
@@ -123,7 +116,7 @@ def predict(user_text=""):
123
 
124
  # --- Full text analysis ---
125
  full_ai_result = detect_ai(user_text)
126
- dot_sentences = split_on_dots(user_text)
127
  full_fact_checking = {s: fetch_google_search(s) for s in dot_sentences}
128
 
129
  # --- Claim-based analysis ---
 
25
  GOOGLE_DAILY_LIMIT = 100
26
 
27
  # ---------------------------
28
+ # Safe Split Helpers
29
+ # ---------------------------
30
def safe_split_text(text):
    """
    Split text into candidate sentences/clauses on '.', ',' or ';'.

    A delimiter is NOT treated as a split point when it sits inside a
    number, e.g. 1.41, 1,200, $1,200.  Bug fix vs. the previous pattern:
    the old regex `(?<!\\d)[.](?!\\d)` refused to split whenever EITHER
    neighbor was a digit, so a sentence ending in a number
    ("... in 2020. The next ...") was never split.  We now only protect a
    delimiter that is *between* digits (or a comma preceded by a digit/$
    and followed by a digit).

    Parameters:
        text (str): raw input text.

    Returns:
        list[str]: stripped fragments containing more than 4 words;
        shorter fragments are dropped as unlikely to be full claims.
    """
    # Split on:
    #   '.'  unless between digits (protects 1.41),
    #   ','  unless between a digit/'$' and a digit (protects 1,200 / $1,200),
    #   ';'  always.
    pattern = r'\.(?!\d)|(?<!\d)\.|,(?!\d)|(?<![\d$]),|;'
    return [s.strip() for s in re.split(pattern, text) if len(s.strip().split()) > 4]
37
+
38
+ # ---------------------------
39
+ # Claim Extraction
40
  # ---------------------------
41
  def extract_claims(page_text, max_claims=20, batch_size=50):
42
  """
43
  Extract top claims from text:
44
+ - Uses safe_split_text for splitting.
45
+ - Classifies each piece into factual claim, opinion, or anecdote.
46
  """
47
+ sentences = safe_split_text(page_text)
 
 
 
 
 
 
 
48
 
49
+ # Step 1: Function to classify a single sentence
 
 
 
 
50
  def classify_sentence(s):
51
  out = claim_classifier(s, claim_labels)
52
  label_priority = ["factual claim", "opinion", "personal anecdote"]
 
55
  return {"text": s, "label": lbl, "score": round(out["scores"][out["labels"].index(lbl)], 3)}
56
  return None
57
 
58
+ # Step 2: Threaded classification
59
  results = []
60
  with ThreadPoolExecutor() as executor:
61
+ for r in executor.map(classify_sentence, sentences):
62
  if r:
63
  results.append(r)
64
 
65
+ # Step 3: Limit top claims
66
  results = sorted(results, key=lambda x: -len(x["text"]))[:max_claims]
 
67
  return results
68
 
69
  # ---------------------------
 
102
  except Exception:
103
  return []
104
 
 
 
 
 
 
 
105
  # ---------------------------
106
  # Unified Predict Function
107
  # ---------------------------
108
  def predict(user_text=""):
109
  """
110
  Runs both:
111
+ 1. Full-text analysis (AI detection on entire input + safe-split fact-check)
112
  2. Claim-extracted analysis (claim split + AI detection + fact-check)
113
  """
114
  if not user_text.strip():
 
116
 
117
  # --- Full text analysis ---
118
  full_ai_result = detect_ai(user_text)
119
+ dot_sentences = safe_split_text(user_text)
120
  full_fact_checking = {s: fetch_google_search(s) for s in dot_sentences}
121
 
122
  # --- Claim-based analysis ---