Update app.py
Browse files
app.py
CHANGED
|
@@ -29,22 +29,19 @@ GOOGLE_DAILY_LIMIT = 100
|
|
| 29 |
# ---------------------------
|
| 30 |
def extract_claims(page_text, max_claims=20, batch_size=50):
|
| 31 |
"""
|
| 32 |
-
Extract top claims from
|
| 33 |
- Split on '.' first, then split on ',' and ';' but skip numeric/money commas.
|
| 34 |
- Use zero-shot classification to get factual claim, opinion, or personal anecdote.
|
| 35 |
-
- Threaded processing for efficiency.
|
| 36 |
"""
|
| 37 |
# Step 1: Split text on '.'
|
| 38 |
sentences = [s.strip() for s in page_text.split('.') if len(s.strip().split()) > 4]
|
| 39 |
|
| 40 |
# Step 2: Function to safely split a sentence on ',' and ';'
|
| 41 |
def safe_split(s):
|
| 42 |
-
|
| 43 |
-
pattern = r'(?<![\d\$]),|;' # split on comma not preceded by digit or $ or on semicolon
|
| 44 |
chunks = re.split(pattern, s)
|
| 45 |
return [c.strip() for c in chunks if len(c.split()) > 4]
|
| 46 |
|
| 47 |
-
# Apply safe splitting
|
| 48 |
refined_sentences = []
|
| 49 |
for s in sentences:
|
| 50 |
refined_sentences.extend(safe_split(s))
|
|
@@ -52,7 +49,6 @@ def extract_claims(page_text, max_claims=20, batch_size=50):
|
|
| 52 |
# Step 3: Function to classify a single sentence
|
| 53 |
def classify_sentence(s):
|
| 54 |
out = claim_classifier(s, claim_labels)
|
| 55 |
-
# Pick the most important label (factual > opinion > personal anecdote)
|
| 56 |
label_priority = ["factual claim", "opinion", "personal anecdote"]
|
| 57 |
for lbl in label_priority:
|
| 58 |
if lbl in out["labels"]:
|
|
@@ -107,38 +103,58 @@ def fetch_google_search(claim):
|
|
| 107 |
except Exception:
|
| 108 |
return []
|
| 109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
# ---------------------------
|
| 111 |
# Unified Predict Function
|
| 112 |
# ---------------------------
|
| 113 |
-
def predict(
|
| 114 |
"""
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
4. Store evidence directly inside fact_checking (claim → list of 3 summaries)
|
| 119 |
"""
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
-
#
|
|
|
|
|
|
|
|
|
|
| 125 |
fact_checking = {c["text"]: fetch_google_search(c["text"]) for c in claims_data}
|
| 126 |
|
| 127 |
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
"claims": claims_data,
|
| 129 |
-
"
|
|
|
|
| 130 |
"google_quota_used": google_quota["count"],
|
| 131 |
-
"google_quota_reset": str(datetime.datetime.combine(
|
| 132 |
-
|
|
|
|
|
|
|
| 133 |
}
|
| 134 |
|
| 135 |
# ---------------------------
|
| 136 |
# Gradio UI
|
| 137 |
# ---------------------------
|
| 138 |
with gr.Blocks() as demo:
|
| 139 |
-
gr.Markdown("## EduShield AI Backend -
|
| 140 |
|
| 141 |
-
page_text_input = gr.Textbox(label="
|
| 142 |
predict_btn = gr.Button("Run Predict")
|
| 143 |
output_json = gr.JSON(label="Predict Results")
|
| 144 |
|
|
|
|
| 29 |
# ---------------------------
|
| 30 |
def extract_claims(page_text, max_claims=20, batch_size=50):
|
| 31 |
"""
|
| 32 |
+
Extract top claims from text:
|
| 33 |
- Split on '.' first, then split on ',' and ';' but skip numeric/money commas.
|
| 34 |
- Use zero-shot classification to get factual claim, opinion, or personal anecdote.
|
|
|
|
| 35 |
"""
|
| 36 |
# Step 1: Split text on '.'
|
| 37 |
sentences = [s.strip() for s in page_text.split('.') if len(s.strip().split()) > 4]
|
| 38 |
|
| 39 |
# Step 2: Function to safely split a sentence on ',' and ';'
|
| 40 |
def safe_split(s):
|
| 41 |
+
pattern = r'(?<![\d\$]),|;' # avoid commas in numbers like 1,000
|
|
|
|
| 42 |
chunks = re.split(pattern, s)
|
| 43 |
return [c.strip() for c in chunks if len(c.split()) > 4]
|
| 44 |
|
|
|
|
| 45 |
refined_sentences = []
|
| 46 |
for s in sentences:
|
| 47 |
refined_sentences.extend(safe_split(s))
|
|
|
|
| 49 |
# Step 3: Function to classify a single sentence
|
| 50 |
def classify_sentence(s):
|
| 51 |
out = claim_classifier(s, claim_labels)
|
|
|
|
| 52 |
label_priority = ["factual claim", "opinion", "personal anecdote"]
|
| 53 |
for lbl in label_priority:
|
| 54 |
if lbl in out["labels"]:
|
|
|
|
| 103 |
except Exception:
|
| 104 |
return []
|
| 105 |
|
| 106 |
# ---------------------------
# Dot-split helper for raw text
# ---------------------------
def split_on_dots(text):
    """Split *text* on periods and keep only fragments longer than four words."""
    fragments = []
    for piece in text.split('.'):
        piece = piece.strip()
        if len(piece.split()) > 4:
            fragments.append(piece)
    return fragments
|
| 111 |
+
|
| 112 |
# ---------------------------
# Unified Predict Function
# ---------------------------
def predict(user_text=""):
    """
    Runs both:
    1. Full-text analysis (AI detection on entire input + dot-split fact-check)
    2. Claim-extracted analysis (claim split + AI detection + fact-check)
    """
    # Guard clause: nothing to analyze.
    if not user_text.strip():
        return {"error": "No text provided."}

    # --- Full text analysis ---
    whole_text_ai = detect_ai(user_text)
    sentence_evidence = {}
    for sentence in split_on_dots(user_text):
        sentence_evidence[sentence] = fetch_google_search(sentence)

    # --- Claim-based analysis ---
    extracted = extract_claims(user_text)
    claim_texts = [claim["text"] for claim in extracted]
    # detect_ai is only invoked when there is at least one claim to score.
    per_claim_ai = detect_ai(claim_texts) if claim_texts else []
    claim_evidence = {claim["text"]: fetch_google_search(claim["text"]) for claim in extracted}

    # Quota resets at midnight of the day after the recorded quota date.
    next_reset = datetime.datetime.combine(
        google_quota["date"] + datetime.timedelta(days=1),
        datetime.time.min,
    )

    return {
        "full_text": {
            "input": user_text,
            "ai_detection": whole_text_ai,
            "fact_checking": sentence_evidence,
        },
        "claims": extracted,
        "claims_ai_detection": per_claim_ai,
        "claims_fact_checking": claim_evidence,
        "google_quota_used": google_quota["count"],
        "google_quota_reset": str(next_reset),
    }
|
| 150 |
|
| 151 |
# ---------------------------
|
| 152 |
# Gradio UI
|
| 153 |
# ---------------------------
|
| 154 |
with gr.Blocks() as demo:
|
| 155 |
+
gr.Markdown("## EduShield AI Backend - Dual Mode (Full-text + Claims)")
|
| 156 |
|
| 157 |
+
page_text_input = gr.Textbox(label="Input Text", lines=10, placeholder="Paste text here...")
|
| 158 |
predict_btn = gr.Button("Run Predict")
|
| 159 |
output_json = gr.JSON(label="Predict Results")
|
| 160 |
|