Spaces:
Sleeping
Sleeping
refactor code
Browse files
- AIGVDet/main.py +2 -2
- api_server.py +2 -2
- miragenews/data/encode_predictions.py +13 -13
- miragenews/img/resources.py +1 -1
- miragenews/img/web_utils.py +11 -11
- miragenews/merge_img_text.py +30 -30
- miragenews/test_single_pair.py +6 -6
- miragenews/text_module/config.py +2 -2
- miragenews/text_module/llm_utils.py +1 -1
AIGVDet/main.py
CHANGED
|
@@ -15,10 +15,10 @@ def run_video_to_json(
|
|
| 15 |
optical_root: str = "optical_result"
|
| 16 |
) -> Dict:
|
| 17 |
"""
|
| 18 |
-
|
| 19 |
|
| 20 |
Returns:
|
| 21 |
-
dict
|
| 22 |
"""
|
| 23 |
|
| 24 |
script_dir = os.path.dirname(os.path.abspath(__file__))
|
|
|
|
| 15 |
optical_root: str = "optical_result"
|
| 16 |
) -> Dict:
|
| 17 |
"""
|
| 18 |
+
Process a single video and write the result to a JSON file.
|
| 19 |
|
| 20 |
Returns:
|
| 21 |
+
result dict (and optionally writes to JSON)
|
| 22 |
"""
|
| 23 |
|
| 24 |
script_dir = os.path.dirname(os.path.abspath(__file__))
|
api_server.py
CHANGED
|
@@ -25,14 +25,14 @@ cred_json = os.getenv("GOOGLE_CREDENTIALS_JSON")
|
|
| 25 |
if ENV == "hf":
|
| 26 |
if cred_json:
|
| 27 |
try:
|
| 28 |
-
# Parse
|
| 29 |
json.loads(cred_json)
|
| 30 |
|
| 31 |
file_path = "google-credentials.json"
|
| 32 |
with open(file_path, "w") as f:
|
| 33 |
f.write(cred_json)
|
| 34 |
|
| 35 |
-
#
|
| 36 |
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = file_path
|
| 37 |
|
| 38 |
print("[INFO] Google credentials saved to", file_path)
|
|
|
|
| 25 |
if ENV == "hf":
|
| 26 |
if cred_json:
|
| 27 |
try:
|
| 28 |
+
# Parse to ensure the JSON payload is valid
|
| 29 |
json.loads(cred_json)
|
| 30 |
|
| 31 |
file_path = "google-credentials.json"
|
| 32 |
with open(file_path, "w") as f:
|
| 33 |
f.write(cred_json)
|
| 34 |
|
| 35 |
+
# Point GOOGLE_APPLICATION_CREDENTIALS at the saved file so Google auth can auto-detect the credentials
|
| 36 |
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = file_path
|
| 37 |
|
| 38 |
print("[INFO] Google credentials saved to", file_path)
|
miragenews/data/encode_predictions.py
CHANGED
|
@@ -111,11 +111,11 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
|
|
| 111 |
|
| 112 |
if mode == "image":
|
| 113 |
# Load shared processors and models
|
| 114 |
-
#
|
| 115 |
object_processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
|
| 116 |
object_detector = Owlv2ForObjectDetection.from_pretrained(
|
| 117 |
"google/owlv2-base-patch16-ensemble",
|
| 118 |
-
torch_dtype=torch.float16 #
|
| 119 |
).to(device)
|
| 120 |
image_processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
|
| 121 |
image_encoder = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl").vision_model.to(device)
|
|
@@ -176,16 +176,16 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
|
|
| 176 |
print(f"Predictions for fake images in {read_dir} saved.")
|
| 177 |
|
| 178 |
# ==================================================================
|
| 179 |
-
# ===
|
| 180 |
# ==================================================================
|
| 181 |
elif text_dirs:
|
| 182 |
for read_dir in text_dirs:
|
| 183 |
-
#
|
| 184 |
if model_class != "linear":
|
| 185 |
print(f"Warning: Only 'linear' model class is supported for custom text dirs. Skipping '{model_class}'.")
|
| 186 |
-
continue #
|
| 187 |
|
| 188 |
-
for label in ["real", "fake"]: #
|
| 189 |
text_dir = os.path.join("my_dataset/text", read_dir, label)
|
| 190 |
batch = []
|
| 191 |
predictions = []
|
|
@@ -194,7 +194,7 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
|
|
| 194 |
print(f"Processing directory: {text_dir}")
|
| 195 |
for text_name in tqdm(sorted(os.listdir(text_dir)), desc=f"Processing {text_dir} with {model_class}"):
|
| 196 |
text_path = os.path.join(text_dir, text_name)
|
| 197 |
-
#
|
| 198 |
try:
|
| 199 |
with open(text_path, 'r', encoding='utf-8') as f:
|
| 200 |
text = f.read()
|
|
@@ -203,25 +203,25 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
|
|
| 203 |
continue
|
| 204 |
|
| 205 |
batch.append(text)
|
| 206 |
-
#
|
| 207 |
if len(batch) == batch_size:
|
| 208 |
text_encoding = preprocess_texts(batch, clip_model, device)
|
| 209 |
predictions.append(process_txt_linear(model, text_encoding, device))
|
| 210 |
batch = []
|
| 211 |
|
| 212 |
-
#
|
| 213 |
if batch:
|
| 214 |
text_encoding = preprocess_texts(batch, clip_model, device)
|
| 215 |
predictions.append(process_txt_linear(model, text_encoding, device))
|
| 216 |
|
| 217 |
-
#
|
| 218 |
if predictions:
|
| 219 |
save_predictions(torch.cat(predictions), output_dir, mode, model_class, read_dir, label)
|
| 220 |
print(f"Predictions for {label} texts in {read_dir} saved.")
|
| 221 |
else:
|
| 222 |
print(f"Directory not found, skipping: {text_dir}")
|
| 223 |
# ==================================================================
|
| 224 |
-
# ===
|
| 225 |
# ==================================================================
|
| 226 |
|
| 227 |
else:
|
|
@@ -229,12 +229,12 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
|
|
| 229 |
dataset_name = "anson-huang/mirage-news"
|
| 230 |
available_splits = list(load_dataset(dataset_name).keys())
|
| 231 |
if test_only:
|
| 232 |
-
#
|
| 233 |
available_splits = [s for s in available_splits if s.startswith('test')]
|
| 234 |
|
| 235 |
for split in available_splits:
|
| 236 |
if split not in ['train', 'validation'] and not test_only:
|
| 237 |
-
continue #
|
| 238 |
|
| 239 |
dataset = load_dataset(dataset_name, split=split)
|
| 240 |
for label in ["real", "fake"]:
|
|
|
|
| 111 |
|
| 112 |
if mode == "image":
|
| 113 |
# Load shared processors and models
|
| 114 |
+
# Fix OOM issues for cbm-encoder
|
| 115 |
object_processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
|
| 116 |
object_detector = Owlv2ForObjectDetection.from_pretrained(
|
| 117 |
"google/owlv2-base-patch16-ensemble",
|
| 118 |
+
torch_dtype=torch.float16 # Add float16
|
| 119 |
).to(device)
|
| 120 |
image_processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
|
| 121 |
image_encoder = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl").vision_model.to(device)
|
|
|
|
| 176 |
print(f"Predictions for fake images in {read_dir} saved.")
|
| 177 |
|
| 178 |
# ==================================================================
|
| 179 |
+
# === START NEW BLOCK TO HANDLE CUSTOM TEXT ===
|
| 180 |
# ==================================================================
|
| 181 |
elif text_dirs:
|
| 182 |
for read_dir in text_dirs:
|
| 183 |
+
# This logic only runs for 'linear' because TBM (18-dim) is not feasible for custom text
|
| 184 |
if model_class != "linear":
|
| 185 |
print(f"Warning: Only 'linear' model class is supported for custom text dirs. Skipping '{model_class}'.")
|
| 186 |
+
continue # Skip if the model class is not 'linear'
|
| 187 |
|
| 188 |
+
for label in ["real", "fake"]: # Handle both 'real' and 'fake' folders when present
|
| 189 |
text_dir = os.path.join("my_dataset/text", read_dir, label)
|
| 190 |
batch = []
|
| 191 |
predictions = []
|
|
|
|
| 194 |
print(f"Processing directory: {text_dir}")
|
| 195 |
for text_name in tqdm(sorted(os.listdir(text_dir)), desc=f"Processing {text_dir} with {model_class}"):
|
| 196 |
text_path = os.path.join(text_dir, text_name)
|
| 197 |
+
# Read text file contents
|
| 198 |
try:
|
| 199 |
with open(text_path, 'r', encoding='utf-8') as f:
|
| 200 |
text = f.read()
|
|
|
|
| 203 |
continue
|
| 204 |
|
| 205 |
batch.append(text)
|
| 206 |
+
# Process batch when it reaches capacity
|
| 207 |
if len(batch) == batch_size:
|
| 208 |
text_encoding = preprocess_texts(batch, clip_model, device)
|
| 209 |
predictions.append(process_txt_linear(model, text_encoding, device))
|
| 210 |
batch = []
|
| 211 |
|
| 212 |
+
# Process any remaining batch
|
| 213 |
if batch:
|
| 214 |
text_encoding = preprocess_texts(batch, clip_model, device)
|
| 215 |
predictions.append(process_txt_linear(model, text_encoding, device))
|
| 216 |
|
| 217 |
+
# Save .pt file if predictions exist
|
| 218 |
if predictions:
|
| 219 |
save_predictions(torch.cat(predictions), output_dir, mode, model_class, read_dir, label)
|
| 220 |
print(f"Predictions for {label} texts in {read_dir} saved.")
|
| 221 |
else:
|
| 222 |
print(f"Directory not found, skipping: {text_dir}")
|
| 223 |
# ==================================================================
|
| 224 |
+
# === END OF NEW BLOCK ===
|
| 225 |
# ==================================================================
|
| 226 |
|
| 227 |
else:
|
|
|
|
| 229 |
dataset_name = "anson-huang/mirage-news"
|
| 230 |
available_splits = list(load_dataset(dataset_name).keys())
|
| 231 |
if test_only:
|
| 232 |
+
# Keep only the splits whose name starts with 'test'
|
| 233 |
available_splits = [s for s in available_splits if s.startswith('test')]
|
| 234 |
|
| 235 |
for split in available_splits:
|
| 236 |
if split not in ['train', 'validation'] and not test_only:
|
| 237 |
+
continue # Skip test splits when test_only flag is not set
|
| 238 |
|
| 239 |
dataset = load_dataset(dataset_name, split=split)
|
| 240 |
for label in ["real", "fake"]:
|
miragenews/img/resources.py
CHANGED
|
@@ -4,7 +4,7 @@ from google.cloud import vision
|
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
import os
|
| 6 |
|
| 7 |
-
# Import local modules (
|
| 8 |
|
| 9 |
from .semantic_filter import SemanticFilter
|
| 10 |
from miragenews.models import get_model
|
|
|
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
import os
|
| 6 |
|
| 7 |
+
# Import local modules (keep your existing logic)
|
| 8 |
|
| 9 |
from .semantic_filter import SemanticFilter
|
| 10 |
from miragenews.models import get_model
|
miragenews/img/web_utils.py
CHANGED
|
@@ -98,7 +98,7 @@ async def find_best_url_fast_scan_bs4(
|
|
| 98 |
continue
|
| 99 |
|
| 100 |
if not img_tags:
|
| 101 |
-
print("[Fast Scan]
|
| 102 |
continue
|
| 103 |
|
| 104 |
for img_tag in img_tags:
|
|
@@ -124,18 +124,18 @@ async def find_best_url_fast_scan_bs4(
|
|
| 124 |
best_url = url
|
| 125 |
|
| 126 |
if sim > 0.9:
|
| 127 |
-
print(f"✅ [Fast Scan]
|
| 128 |
return url, sim
|
| 129 |
|
| 130 |
except Exception as e:
|
| 131 |
pass
|
| 132 |
|
| 133 |
if best_url:
|
| 134 |
-
print(f"ℹ️ [Fast Scan]
|
| 135 |
return best_url, max_sim
|
| 136 |
|
| 137 |
if not best_url and urls:
|
| 138 |
-
print(f"ℹ️ [Fast Scan]
|
| 139 |
return urls[0], 0.0
|
| 140 |
|
| 141 |
return None, 0.0
|
|
@@ -150,12 +150,12 @@ async def get_html_context_block_bs4(
|
|
| 150 |
api_key: Optional[str],
|
| 151 |
progress: gr.Progress
|
| 152 |
) -> str:
|
| 153 |
-
print(f"--- [Deep Scan]
|
| 154 |
progress(0.6, desc="Phase 2/2: Deep scan (Fetching HTML block)...")
|
| 155 |
|
| 156 |
html = await scrape_html_with_fallback(url, client, api_key)
|
| 157 |
if not html:
|
| 158 |
-
print("[Deep Scan]
|
| 159 |
return ""
|
| 160 |
|
| 161 |
try:
|
|
@@ -191,7 +191,7 @@ async def get_html_context_block_bs4(
|
|
| 191 |
pass
|
| 192 |
|
| 193 |
if best_tag:
|
| 194 |
-
print(f"[Deep Scan]
|
| 195 |
|
| 196 |
current = best_tag
|
| 197 |
for _ in range(5):
|
|
@@ -200,12 +200,12 @@ async def get_html_context_block_bs4(
|
|
| 200 |
break
|
| 201 |
parent_name = parent.name.lower()
|
| 202 |
if parent_name in ['article', 'section', 'li', 'main']:
|
| 203 |
-
print(f"[Deep Scan]
|
| 204 |
return str(parent)
|
| 205 |
if parent_name == 'div':
|
| 206 |
class_list = parent.get('class', [])
|
| 207 |
if any(cls in ['content', 'post', 'article', 'story-body', 'caption'] for cls in class_list):
|
| 208 |
-
print(f"[Deep Scan]
|
| 209 |
return str(parent)
|
| 210 |
current = parent
|
| 211 |
|
|
@@ -216,9 +216,9 @@ async def get_html_context_block_bs4(
|
|
| 216 |
else:
|
| 217 |
return str(best_tag.parent)
|
| 218 |
else:
|
| 219 |
-
print("[Deep Scan]
|
| 220 |
return ""
|
| 221 |
except Exception as e:
|
| 222 |
-
print(f"❌ [Deep Scan]
|
| 223 |
return ""
|
| 224 |
|
|
|
|
| 98 |
continue
|
| 99 |
|
| 100 |
if not img_tags:
|
| 101 |
+
print("[Fast Scan] No <img> tag found.")
|
| 102 |
continue
|
| 103 |
|
| 104 |
for img_tag in img_tags:
|
|
|
|
| 124 |
best_url = url
|
| 125 |
|
| 126 |
if sim > 0.9:
|
| 127 |
+
print(f"✅ [Fast Scan] FOUND MATCH > 0.9 (Sim: {sim:.4f}) at: {url}")
|
| 128 |
return url, sim
|
| 129 |
|
| 130 |
except Exception as e:
|
| 131 |
pass
|
| 132 |
|
| 133 |
if best_url:
|
| 134 |
+
print(f"ℹ️ [Fast Scan] No similarity > 0.9. Using best-match URL: {best_url} (Sim: {max_sim:.4f})")
|
| 135 |
return best_url, max_sim
|
| 136 |
|
| 137 |
if not best_url and urls:
|
| 138 |
+
print(f"ℹ️ [Fast Scan] No images found. Using the first URL as fallback.")
|
| 139 |
return urls[0], 0.0
|
| 140 |
|
| 141 |
return None, 0.0
|
|
|
|
| 150 |
api_key: Optional[str],
|
| 151 |
progress: gr.Progress
|
| 152 |
) -> str:
|
| 153 |
+
print(f"--- [Deep Scan] Fetching HTML block from: {url} ---")
|
| 154 |
progress(0.6, desc="Phase 2/2: Deep scan (Fetching HTML block)...")
|
| 155 |
|
| 156 |
html = await scrape_html_with_fallback(url, client, api_key)
|
| 157 |
if not html:
|
| 158 |
+
print("[Deep Scan] HTML scrape failed.")
|
| 159 |
return ""
|
| 160 |
|
| 161 |
try:
|
|
|
|
| 191 |
pass
|
| 192 |
|
| 193 |
if best_tag:
|
| 194 |
+
print(f"[Deep Scan] Found closest matching image (Sim: {max_sim:.4f}). Looking for parent block...")
|
| 195 |
|
| 196 |
current = best_tag
|
| 197 |
for _ in range(5):
|
|
|
|
| 200 |
break
|
| 201 |
parent_name = parent.name.lower()
|
| 202 |
if parent_name in ['article', 'section', 'li', 'main']:
|
| 203 |
+
print(f"[Deep Scan] Found semantic block: <{parent_name}>")
|
| 204 |
return str(parent)
|
| 205 |
if parent_name == 'div':
|
| 206 |
class_list = parent.get('class', [])
|
| 207 |
if any(cls in ['content', 'post', 'article', 'story-body', 'caption'] for cls in class_list):
|
| 208 |
+
print(f"[Deep Scan] Found important div block: {class_list}")
|
| 209 |
return str(parent)
|
| 210 |
current = parent
|
| 211 |
|
|
|
|
| 216 |
else:
|
| 217 |
return str(best_tag.parent)
|
| 218 |
else:
|
| 219 |
+
print("[Deep Scan] No matching images found.")
|
| 220 |
return ""
|
| 221 |
except Exception as e:
|
| 222 |
+
print(f"❌ [Deep Scan] Error parsing HTML: {e}")
|
| 223 |
return ""
|
| 224 |
|
miragenews/merge_img_text.py
CHANGED
|
@@ -5,45 +5,45 @@ from img.core import analyze_saved_images
|
|
| 5 |
from text_module.pipeline import verify_text_logic
|
| 6 |
from text_module.TextAnalysisResult import TextAnalysisResult
|
| 7 |
|
| 8 |
-
# --- HELPER:
|
| 9 |
def parse_child_report(report_text):
|
| 10 |
"""
|
| 11 |
-
|
| 12 |
"""
|
| 13 |
data = {
|
| 14 |
"auth": "N/A", "tools": "Unknown", "synth": "N/A", "artifacts": ""
|
| 15 |
}
|
| 16 |
if not report_text: return data
|
| 17 |
|
| 18 |
-
# 1.
|
| 19 |
-
#
|
| 20 |
auth_match = re.search(r"Authenticity Assessment:\s*(.+)", report_text)
|
| 21 |
if auth_match:
|
| 22 |
data["auth"] = auth_match.group(1).strip()
|
| 23 |
|
| 24 |
-
# 2.
|
| 25 |
tools_match = re.search(r"Verification Tools & Methods:\s*(.+)", report_text)
|
| 26 |
if tools_match:
|
| 27 |
data["tools"] = tools_match.group(1).strip()
|
| 28 |
|
| 29 |
-
# 3.
|
| 30 |
synth_match = re.search(r"Synthetic Type \(if applicable\):\s*(.+)", report_text)
|
| 31 |
if synth_match:
|
| 32 |
data["synth"] = synth_match.group(1).strip()
|
| 33 |
|
| 34 |
-
# 4.
|
| 35 |
art_match = re.search(r"Other Artifacts:\s*(.*)", report_text, re.DOTALL)
|
| 36 |
if art_match:
|
| 37 |
data["artifacts"] = art_match.group(1).strip()
|
| 38 |
|
| 39 |
return data
|
| 40 |
|
| 41 |
-
# --- HELPER: CHECK FAKE
|
| 42 |
def is_verdict_fake(assessment_string):
|
| 43 |
if not assessment_string: return False
|
| 44 |
s = assessment_string.lower().strip()
|
| 45 |
|
| 46 |
-
#
|
| 47 |
fake_keywords = ["not real", "fake", "manipulated", "generated", "artificial", "synthetic"]
|
| 48 |
|
| 49 |
for kw in fake_keywords:
|
|
@@ -51,7 +51,7 @@ def is_verdict_fake(assessment_string):
|
|
| 51 |
return True
|
| 52 |
return False
|
| 53 |
|
| 54 |
-
# --- HTML STATUS BAR (
|
| 55 |
def create_status_html(label, status, message):
|
| 56 |
color = "#9ca3af"; percent = 5; icon = "⏳"; bg_pulse = ""; text_color = "#374151"
|
| 57 |
if status == 'processing':
|
|
@@ -75,7 +75,7 @@ def create_status_html(label, status, message):
|
|
| 75 |
"""
|
| 76 |
return html
|
| 77 |
|
| 78 |
-
# --- TASK 1:
|
| 79 |
async def run_image_task(shared_state, image_input):
|
| 80 |
shared_state['img_status'] = 'processing'
|
| 81 |
shared_state['img_msg'] = "Scanning artifacts..."
|
|
@@ -89,10 +89,10 @@ async def run_image_task(shared_state, image_input):
|
|
| 89 |
else:
|
| 90 |
for res in gen: final_json, final_report_md = res
|
| 91 |
|
| 92 |
-
#
|
| 93 |
img_result_obj.set_other_artifacts(final_report_md)
|
| 94 |
|
| 95 |
-
# Parse
|
| 96 |
parsed = parse_child_report(final_report_md)
|
| 97 |
img_result_obj.set_authenticity_assessment(parsed["auth"])
|
| 98 |
|
|
@@ -104,7 +104,7 @@ async def run_image_task(shared_state, image_input):
|
|
| 104 |
img_result_obj.set_authenticity_assessment("Error")
|
| 105 |
return img_result_obj
|
| 106 |
|
| 107 |
-
# --- TASK 2:
|
| 108 |
async def run_text_task(shared_state, text_input):
|
| 109 |
shared_state['txt_status'] = 'processing'
|
| 110 |
shared_state['txt_msg'] = "Verifying logic..."
|
|
@@ -137,7 +137,7 @@ async def verify_multimodal_logic(image_state, text_input):
|
|
| 137 |
if task_img.done() and img_res is None:
|
| 138 |
try:
|
| 139 |
img_res = task_img.result()
|
| 140 |
-
# Check
|
| 141 |
if is_verdict_fake(img_res.get_authenticity_assessment()):
|
| 142 |
if not task_txt.done(): task_txt.cancel(); shared_state['txt_msg'] = "Stopped (Image is Fake)"
|
| 143 |
break
|
|
@@ -146,7 +146,7 @@ async def verify_multimodal_logic(image_state, text_input):
|
|
| 146 |
if task_txt.done() and txt_res is None:
|
| 147 |
try:
|
| 148 |
txt_res = task_txt.result()
|
| 149 |
-
# Check
|
| 150 |
if is_verdict_fake(txt_res.get_authenticity_assessment()):
|
| 151 |
if not task_img.done(): task_img.cancel(); shared_state['img_msg'] = "Stopped (Text is Fake)"
|
| 152 |
break
|
|
@@ -159,17 +159,17 @@ async def verify_multimodal_logic(image_state, text_input):
|
|
| 159 |
if not txt_res: txt_res = TextAnalysisResult(authenticity_assessment="Skipped")
|
| 160 |
|
| 161 |
# =========================================================================
|
| 162 |
-
# LOGIC
|
| 163 |
# =========================================================================
|
| 164 |
|
| 165 |
-
# 1. Parse Image Report
|
| 166 |
img_data_parsed = parse_child_report(img_res.get_other_artifacts())
|
| 167 |
img_auth_line = img_data_parsed["auth"] # e.g. "🧑 REAL PHOTO"
|
| 168 |
|
| 169 |
-
# 2.
|
| 170 |
txt_auth_line = txt_res.get_authenticity_assessment() # e.g. "REAL (Authentic)"
|
| 171 |
|
| 172 |
-
# 3.
|
| 173 |
img_is_fake = is_verdict_fake(img_auth_line)
|
| 174 |
txt_is_fake = is_verdict_fake(txt_auth_line)
|
| 175 |
|
|
@@ -187,12 +187,12 @@ async def verify_multimodal_logic(image_state, text_input):
|
|
| 187 |
# --- FIELD 3: Synthetic Type ---
|
| 188 |
final_synth_list = []
|
| 189 |
|
| 190 |
-
#
|
| 191 |
if img_is_fake:
|
| 192 |
s_type = img_data_parsed["synth"] if img_data_parsed["synth"] != "N/A" else "Manipulated Image"
|
| 193 |
final_synth_list.append(f"**Image:** {s_type}")
|
| 194 |
|
| 195 |
-
#
|
| 196 |
if txt_is_fake:
|
| 197 |
s_type = txt_res.get_synthetic_type()
|
| 198 |
if not s_type or s_type == "N/A": s_type = "Generated Content"
|
|
@@ -200,36 +200,36 @@ async def verify_multimodal_logic(image_state, text_input):
|
|
| 200 |
|
| 201 |
final_synth_str = "\n".join(final_synth_list) if final_synth_list else "N/A"
|
| 202 |
|
| 203 |
-
# --- FIELD 4: Other Artifacts (
|
| 204 |
final_artifacts_str = ""
|
| 205 |
|
| 206 |
-
# Case:
|
| 207 |
if img_is_fake and txt_is_fake:
|
| 208 |
final_artifacts_str = f"**[Image Evidence]**\n{img_data_parsed['artifacts']}\n\n**[Text Evidence]**\n{txt_res.get_other_artifacts()}"
|
| 209 |
|
| 210 |
-
# Case:
|
| 211 |
elif img_is_fake:
|
| 212 |
final_artifacts_str = f"{img_data_parsed['artifacts']}"
|
| 213 |
|
| 214 |
-
# Case:
|
| 215 |
elif txt_is_fake:
|
| 216 |
final_artifacts_str = f"{txt_res.get_other_artifacts()}"
|
| 217 |
|
| 218 |
-
# Case:
|
| 219 |
else:
|
| 220 |
final_artifacts_str = "Both image and text are verified as authentic by our multi-modal pipeline."
|
| 221 |
|
| 222 |
-
# Check source
|
| 223 |
img_src = img_data_parsed.get('artifacts', '').strip()
|
| 224 |
if img_src and img_src != "N/A" and "No details" not in img_src:
|
| 225 |
final_artifacts_str += f"\n\n**For Image:** {img_src}"
|
| 226 |
|
| 227 |
-
# Check source
|
| 228 |
txt_src = txt_res.get_other_artifacts().strip()
|
| 229 |
if txt_src and txt_src != "N/A":
|
| 230 |
final_artifacts_str += f"\n\n**For Text:** {txt_src}"
|
| 231 |
|
| 232 |
-
#
|
| 233 |
final_report_md = f"""
|
| 234 |
### 📋 Final Verification Report
|
| 235 |
|
|
|
|
| 5 |
from text_module.pipeline import verify_text_logic
|
| 6 |
from text_module.TextAnalysisResult import TextAnalysisResult
|
| 7 |
|
| 8 |
+
# --- HELPER: PARSE REPORT ---
|
| 9 |
def parse_child_report(report_text):
|
| 10 |
"""
|
| 11 |
+
Use regex to extract each specific line value.
|
| 12 |
"""
|
| 13 |
data = {
|
| 14 |
"auth": "N/A", "tools": "Unknown", "synth": "N/A", "artifacts": ""
|
| 15 |
}
|
| 16 |
if not report_text: return data
|
| 17 |
|
| 18 |
+
# 1. Extract Authenticity Assessment (most important)
|
| 19 |
+
# This regex only grabs content on the same line after the colon
|
| 20 |
auth_match = re.search(r"Authenticity Assessment:\s*(.+)", report_text)
|
| 21 |
if auth_match:
|
| 22 |
data["auth"] = auth_match.group(1).strip()
|
| 23 |
|
| 24 |
+
# 2. Extract Tools
|
| 25 |
tools_match = re.search(r"Verification Tools & Methods:\s*(.+)", report_text)
|
| 26 |
if tools_match:
|
| 27 |
data["tools"] = tools_match.group(1).strip()
|
| 28 |
|
| 29 |
+
# 3. Extract Synthetic Type
|
| 30 |
synth_match = re.search(r"Synthetic Type \(if applicable\):\s*(.+)", report_text)
|
| 31 |
if synth_match:
|
| 32 |
data["synth"] = synth_match.group(1).strip()
|
| 33 |
|
| 34 |
+
# 4. Extract Artifacts (from that line through the end)
|
| 35 |
art_match = re.search(r"Other Artifacts:\s*(.*)", report_text, re.DOTALL)
|
| 36 |
if art_match:
|
| 37 |
data["artifacts"] = art_match.group(1).strip()
|
| 38 |
|
| 39 |
return data
|
| 40 |
|
| 41 |
+
# --- HELPER: CHECK FAKE USING ONLY ASSESSMENT LINE ---
|
| 42 |
def is_verdict_fake(assessment_string):
|
| 43 |
if not assessment_string: return False
|
| 44 |
s = assessment_string.lower().strip()
|
| 45 |
|
| 46 |
+
# Keywords that indicate a fake verdict
|
| 47 |
fake_keywords = ["not real", "fake", "manipulated", "generated", "artificial", "synthetic"]
|
| 48 |
|
| 49 |
for kw in fake_keywords:
|
|
|
|
| 51 |
return True
|
| 52 |
return False
|
| 53 |
|
| 54 |
+
# --- HTML STATUS BAR (KEEP LOGIC) ---
|
| 55 |
def create_status_html(label, status, message):
|
| 56 |
color = "#9ca3af"; percent = 5; icon = "⏳"; bg_pulse = ""; text_color = "#374151"
|
| 57 |
if status == 'processing':
|
|
|
|
| 75 |
"""
|
| 76 |
return html
|
| 77 |
|
| 78 |
+
# --- TASK 1: PROCESS IMAGES ---
|
| 79 |
async def run_image_task(shared_state, image_input):
|
| 80 |
shared_state['img_status'] = 'processing'
|
| 81 |
shared_state['img_msg'] = "Scanning artifacts..."
|
|
|
|
| 89 |
else:
|
| 90 |
for res in gen: final_json, final_report_md = res
|
| 91 |
|
| 92 |
+
# Save the full report string into artifacts
|
| 93 |
img_result_obj.set_other_artifacts(final_report_md)
|
| 94 |
|
| 95 |
+
# Parse the Auth line to update status (used for possible short-circuit)
|
| 96 |
parsed = parse_child_report(final_report_md)
|
| 97 |
img_result_obj.set_authenticity_assessment(parsed["auth"])
|
| 98 |
|
|
|
|
| 104 |
img_result_obj.set_authenticity_assessment("Error")
|
| 105 |
return img_result_obj
|
| 106 |
|
| 107 |
+
# --- TASK 2: PROCESS TEXT ---
|
| 108 |
async def run_text_task(shared_state, text_input):
|
| 109 |
shared_state['txt_status'] = 'processing'
|
| 110 |
shared_state['txt_msg'] = "Verifying logic..."
|
|
|
|
| 137 |
if task_img.done() and img_res is None:
|
| 138 |
try:
|
| 139 |
img_res = task_img.result()
|
| 140 |
+
# Check fake verdict using only the Assessment line (short-circuit)
|
| 141 |
if is_verdict_fake(img_res.get_authenticity_assessment()):
|
| 142 |
if not task_txt.done(): task_txt.cancel(); shared_state['txt_msg'] = "Stopped (Image is Fake)"
|
| 143 |
break
|
|
|
|
| 146 |
if task_txt.done() and txt_res is None:
|
| 147 |
try:
|
| 148 |
txt_res = task_txt.result()
|
| 149 |
+
# Check fake verdict using only the Assessment line
|
| 150 |
if is_verdict_fake(txt_res.get_authenticity_assessment()):
|
| 151 |
if not task_img.done(): task_img.cancel(); shared_state['img_msg'] = "Stopped (Text is Fake)"
|
| 152 |
break
|
|
|
|
| 159 |
if not txt_res: txt_res = TextAnalysisResult(authenticity_assessment="Skipped")
|
| 160 |
|
| 161 |
# =========================================================================
|
| 162 |
+
# MERGE LOGIC: BASED ONLY ON THE ASSESSMENT LINE
|
| 163 |
# =========================================================================
|
| 164 |
|
| 165 |
+
# 1. Parse Image Report to extract a clean "Authenticity Assessment" line
|
| 166 |
img_data_parsed = parse_child_report(img_res.get_other_artifacts())
|
| 167 |
img_auth_line = img_data_parsed["auth"] # e.g. "🧑 REAL PHOTO"
|
| 168 |
|
| 169 |
+
# 2. Get the Assessment line for Text
|
| 170 |
txt_auth_line = txt_res.get_authenticity_assessment() # e.g. "REAL (Authentic)"
|
| 171 |
|
| 172 |
+
# 3. Determine fake/real based on those two lines
|
| 173 |
img_is_fake = is_verdict_fake(img_auth_line)
|
| 174 |
txt_is_fake = is_verdict_fake(txt_auth_line)
|
| 175 |
|
|
|
|
| 187 |
# --- FIELD 3: Synthetic Type ---
|
| 188 |
final_synth_list = []
|
| 189 |
|
| 190 |
+
# Only pull Synthetic Type from Image module if Image is deemed Fake
|
| 191 |
if img_is_fake:
|
| 192 |
s_type = img_data_parsed["synth"] if img_data_parsed["synth"] != "N/A" else "Manipulated Image"
|
| 193 |
final_synth_list.append(f"**Image:** {s_type}")
|
| 194 |
|
| 195 |
+
# Only pull Synthetic Type from Text module if Text is deemed Fake
|
| 196 |
if txt_is_fake:
|
| 197 |
s_type = txt_res.get_synthetic_type()
|
| 198 |
if not s_type or s_type == "N/A": s_type = "Generated Content"
|
|
|
|
| 200 |
|
| 201 |
final_synth_str = "\n".join(final_synth_list) if final_synth_list else "N/A"
|
| 202 |
|
| 203 |
+
# --- FIELD 4: Other Artifacts (Display source/artifacts logic) ---
|
| 204 |
final_artifacts_str = ""
|
| 205 |
|
| 206 |
+
# Case: both are Fake -> show both
|
| 207 |
if img_is_fake and txt_is_fake:
|
| 208 |
final_artifacts_str = f"**[Image Evidence]**\n{img_data_parsed['artifacts']}\n\n**[Text Evidence]**\n{txt_res.get_other_artifacts()}"
|
| 209 |
|
| 210 |
+
# Case: only Image is Fake -> show image evidence
|
| 211 |
elif img_is_fake:
|
| 212 |
final_artifacts_str = f"{img_data_parsed['artifacts']}"
|
| 213 |
|
| 214 |
+
# Case: only Text is Fake -> show text evidence
|
| 215 |
elif txt_is_fake:
|
| 216 |
final_artifacts_str = f"{txt_res.get_other_artifacts()}"
|
| 217 |
|
| 218 |
+
# Case: both are REAL -> show source if available
|
| 219 |
else:
|
| 220 |
final_artifacts_str = "Both image and text are verified as authentic by our multi-modal pipeline."
|
| 221 |
|
| 222 |
+
# Check image source (non-empty and not N/A)
|
| 223 |
img_src = img_data_parsed.get('artifacts', '').strip()
|
| 224 |
if img_src and img_src != "N/A" and "No details" not in img_src:
|
| 225 |
final_artifacts_str += f"\n\n**For Image:** {img_src}"
|
| 226 |
|
| 227 |
+
# Check text source
|
| 228 |
txt_src = txt_res.get_other_artifacts().strip()
|
| 229 |
if txt_src and txt_src != "N/A":
|
| 230 |
final_artifacts_str += f"\n\n**For Text:** {txt_src}"
|
| 231 |
|
| 232 |
+
# BUILD FINAL MARKDOWN
|
| 233 |
final_report_md = f"""
|
| 234 |
### 📋 Final Verification Report
|
| 235 |
|
miragenews/test_single_pair.py
CHANGED
|
@@ -107,19 +107,19 @@ if __name__ == "__main__":
|
|
| 107 |
else:
|
| 108 |
print(f"\nFailed to process {input_pt_path_single}.")
|
| 109 |
|
| 110 |
-
print("\n" + "="*50 + "\n") #
|
| 111 |
|
| 112 |
-
# ---
|
| 113 |
pt_files_to_check = [
|
| 114 |
-
"encodings/predictions/image/merged/my_single_image_dir/real.pt", #
|
| 115 |
-
# "encodings/predictions/image/merged/another_dir/fake_image.pt", #
|
| 116 |
-
"path/to/nonexistent.pt" #
|
| 117 |
]
|
| 118 |
print("\n--- Processing multiple files ---")
|
| 119 |
results = {}
|
| 120 |
for file_path in pt_files_to_check:
|
| 121 |
prob_fake, label = predict_authenticity_from_pt(file_path, mirage_img, device)
|
| 122 |
-
results[file_path] = (prob_fake, label) #
|
| 123 |
|
| 124 |
print("\n--- Summary ---")
|
| 125 |
for file, (prob_fake, label) in results.items():
|
|
|
|
| 107 |
else:
|
| 108 |
print(f"\nFailed to process {input_pt_path_single}.")
|
| 109 |
|
| 110 |
+
print("\n" + "="*50 + "\n") # Add a divider line
|
| 111 |
|
| 112 |
+
# --- EXAMPLE: PROCESS MULTIPLE FILES ---
|
| 113 |
pt_files_to_check = [
|
| 114 |
+
"encodings/predictions/image/merged/my_single_image_dir/real.pt", # Replace with the real file path
|
| 115 |
+
# "encodings/predictions/image/merged/another_dir/fake_image.pt", # THIS LINE WAS REMOVED
|
| 116 |
+
"path/to/nonexistent.pt" # Example of a missing file
|
| 117 |
]
|
| 118 |
print("\n--- Processing multiple files ---")
|
| 119 |
results = {}
|
| 120 |
for file_path in pt_files_to_check:
|
| 121 |
prob_fake, label = predict_authenticity_from_pt(file_path, mirage_img, device)
|
| 122 |
+
results[file_path] = (prob_fake, label) # Store results in a dictionary
|
| 123 |
|
| 124 |
print("\n--- Summary ---")
|
| 125 |
for file, (prob_fake, label) in results.items():
|
miragenews/text_module/config.py
CHANGED
|
@@ -5,9 +5,9 @@ from dotenv import load_dotenv
|
|
| 5 |
load_dotenv()
|
| 6 |
|
| 7 |
# API Keys
|
| 8 |
-
GOOGLE_API_KEY = os.getenv("GOOGLE_CSE_CX") #
|
| 9 |
GOOGLE_SAFE_BROWSING_API_KEY = os.getenv("GOOGLE_SAFE_BROWSING_API_KEY")
|
| 10 |
-
GOOGLE_CX = os.getenv("GOOGLE_CSE_CX") #
|
| 11 |
SCRAPINGDOG_API_KEY = os.getenv("SCRAPINGDOG_API_KEY")
|
| 12 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 13 |
|
|
|
|
| 5 |
load_dotenv()
|
| 6 |
|
| 7 |
# API Keys
|
| 8 |
+
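GOOGLE_API_KEY = os.getenv("GOOGLE_CSE_CX") # Used for Gemini (per your previous code). NOTE(review): this reads GOOGLE_CSE_CX, the same variable as GOOGLE_CX below — confirm this is intended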
|
| 9 |
GOOGLE_SAFE_BROWSING_API_KEY = os.getenv("GOOGLE_SAFE_BROWSING_API_KEY")
|
| 10 |
+
GOOGLE_CX = os.getenv("GOOGLE_CSE_CX") # Used for search
|
| 11 |
SCRAPINGDOG_API_KEY = os.getenv("SCRAPINGDOG_API_KEY")
|
| 12 |
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
|
| 13 |
|
miragenews/text_module/llm_utils.py
CHANGED
|
@@ -61,7 +61,7 @@ def ask_llm_to_rewrite(text_content):
|
|
| 61 |
max_output_tokens=MAX_TOKENS
|
| 62 |
)
|
| 63 |
|
| 64 |
-
# 4.
|
| 65 |
response = flash_model.generate_content(
|
| 66 |
full_prompt,
|
| 67 |
generation_config=config
|
|
|
|
| 61 |
max_output_tokens=MAX_TOKENS
|
| 62 |
)
|
| 63 |
|
| 64 |
+
# 4. Call generate_content
|
| 65 |
response = flash_model.generate_content(
|
| 66 |
full_prompt,
|
| 67 |
generation_config=config
|