Spaces:

tokyotechlab
/

AIDetect

Sleeping

App Files Files Community

Lucii1 committed on Dec 26, 2025

Commit

ecccf5c

1 Parent(s): 11d339f

refactor code

Browse files

Files changed (9) hide show

AIGVDet/main.py +2 -2
api_server.py +2 -2
miragenews/data/encode_predictions.py +13 -13
miragenews/img/resources.py +1 -1
miragenews/img/web_utils.py +11 -11
miragenews/merge_img_text.py +30 -30
miragenews/test_single_pair.py +6 -6
miragenews/text_module/config.py +2 -2
miragenews/text_module/llm_utils.py +1 -1

AIGVDet/main.py CHANGED Viewed

@@ -15,10 +15,10 @@ def run_video_to_json(
     optical_root: str = "optical_result"
 ) -> Dict:
     """
-    Xử lý 1 video và ghi kết quả ra file JSON.
     Returns:
-        dict kết quả (đồng thời ghi ra JSON)
     """
     script_dir = os.path.dirname(os.path.abspath(__file__))

     optical_root: str = "optical_result"
 ) -> Dict:
     """
+    Process a single video and write the result to a JSON file.
     Returns:
+        result dict (the same result is also written to the JSON file)
     """
     script_dir = os.path.dirname(os.path.abspath(__file__))

api_server.py CHANGED Viewed

@@ -25,14 +25,14 @@ cred_json = os.getenv("GOOGLE_CREDENTIALS_JSON")
 if ENV == "hf":
     if cred_json:
         try:
-            # Parse để đảm bảo JSON hợp lệ
             json.loads(cred_json)
             file_path = "google-credentials.json"
             with open(file_path, "w") as f:
                 f.write(cred_json)
-            # Set lại env để google auth tự nhận
             os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = file_path
             print("[INFO] Google credentials saved to", file_path)

 if ENV == "hf":
     if cred_json:
         try:
+            # Parse to ensure the JSON payload is valid
             json.loads(cred_json)
             file_path = "google-credentials.json"
             with open(file_path, "w") as f:
                 f.write(cred_json)
+            # Point GOOGLE_APPLICATION_CREDENTIALS at the saved file so Google auth can auto-detect the credentials
             os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = file_path
             print("[INFO] Google credentials saved to", file_path)

miragenews/data/encode_predictions.py CHANGED Viewed

@@ -111,11 +111,11 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
     if mode == "image":
         # Load shared processors and models
-        # Sửa lỗi OOM cho cbm-encoder
         object_processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
         object_detector = Owlv2ForObjectDetection.from_pretrained(
             "google/owlv2-base-patch16-ensemble",
-            torch_dtype=torch.float16 # Thêm float16
         ).to(device)
         image_processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
         image_encoder = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl").vision_model.to(device)
@@ -176,16 +176,16 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
                         print(f"Predictions for fake images in {read_dir} saved.")
         # ==================================================================
-        # === BẮT ĐẦU KHỐI CODE MỚI ĐỂ XỬ LÝ TEXT TÙY CHỈNH ===
         # ==================================================================
         elif text_dirs:
             for read_dir in text_dirs:
-                # Logic này chỉ chạy cho 'linear' vì TBM (18-dim) không khả thi cho custom text
                 if model_class != "linear":
                     print(f"Warning: Only 'linear' model class is supported for custom text dirs. Skipping '{model_class}'.")
-                    continue # Bỏ qua nếu model class không phải 'linear'
-                for label in ["real", "fake"]: # Xử lý cả 'real' và 'fake' nếu thư mục tồn tại
                     text_dir = os.path.join("my_dataset/text", read_dir, label)
                     batch = []
                     predictions = []
@@ -194,7 +194,7 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
                         print(f"Processing directory: {text_dir}")
                         for text_name in tqdm(sorted(os.listdir(text_dir)), desc=f"Processing {text_dir} with {model_class}"):
                             text_path = os.path.join(text_dir, text_name)
-                            # Đọc nội dung file text
                             try:
                                 with open(text_path, 'r', encoding='utf-8') as f:
                                     text = f.read()
@@ -203,25 +203,25 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
                                 continue
                             batch.append(text)
-                            # Xử lý batch khi đầy
                             if len(batch) == batch_size:
                                 text_encoding = preprocess_texts(batch, clip_model, device)
                                 predictions.append(process_txt_linear(model, text_encoding, device))
                                 batch = []
-                        # Xử lý batch còn sót lại
                         if batch:
                             text_encoding = preprocess_texts(batch, clip_model, device)
                             predictions.append(process_txt_linear(model, text_encoding, device))
-                        # Lưu file .pt nếu có dự đoán
                         if predictions:
                             save_predictions(torch.cat(predictions), output_dir, mode, model_class, read_dir, label)
                             print(f"Predictions for {label} texts in {read_dir} saved.")
                     else:
                         print(f"Directory not found, skipping: {text_dir}")
         # ==================================================================
-        # === KẾT THÚC KHỐI CODE MỚI ===
         # ==================================================================
     else:
@@ -229,12 +229,12 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
         dataset_name = "anson-huang/mirage-news"
         available_splits = list(load_dataset(dataset_name).keys())
         if test_only:
-            # Lấy 5 split test đầu tiên
             available_splits = [s for s in available_splits if s.startswith('test')]
         for split in available_splits:
             if split not in ['train', 'validation'] and not test_only:
-                 continue # Bỏ qua các split test nếu không có cờ test_only
             dataset = load_dataset(dataset_name, split=split)
             for label in ["real", "fake"]:

     if mode == "image":
         # Load shared processors and models
+        # Fix OOM issues for cbm-encoder
         object_processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
         object_detector = Owlv2ForObjectDetection.from_pretrained(
             "google/owlv2-base-patch16-ensemble",
+            torch_dtype=torch.float16 # Add float16
         ).to(device)
         image_processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
         image_encoder = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl").vision_model.to(device)
                         print(f"Predictions for fake images in {read_dir} saved.")
         # ==================================================================
+        # === START NEW BLOCK TO HANDLE CUSTOM TEXT ===
         # ==================================================================
         elif text_dirs:
             for read_dir in text_dirs:
+                # This logic only runs for 'linear' because TBM (18-dim) is not feasible for custom text
                 if model_class != "linear":
                     print(f"Warning: Only 'linear' model class is supported for custom text dirs. Skipping '{model_class}'.")
+                    continue # Skip if the model class is not 'linear'
+                for label in ["real", "fake"]: # Handle both 'real' and 'fake' folders when present
                     text_dir = os.path.join("my_dataset/text", read_dir, label)
                     batch = []
                     predictions = []
                         print(f"Processing directory: {text_dir}")
                         for text_name in tqdm(sorted(os.listdir(text_dir)), desc=f"Processing {text_dir} with {model_class}"):
                             text_path = os.path.join(text_dir, text_name)
+                            # Read text file contents
                             try:
                                 with open(text_path, 'r', encoding='utf-8') as f:
                                     text = f.read()
                                 continue
                             batch.append(text)
+                            # Process batch when it reaches capacity
                             if len(batch) == batch_size:
                                 text_encoding = preprocess_texts(batch, clip_model, device)
                                 predictions.append(process_txt_linear(model, text_encoding, device))
                                 batch = []
+                        # Process any remaining batch
                         if batch:
                             text_encoding = preprocess_texts(batch, clip_model, device)
                             predictions.append(process_txt_linear(model, text_encoding, device))
+                        # Save .pt file if predictions exist
                         if predictions:
                             save_predictions(torch.cat(predictions), output_dir, mode, model_class, read_dir, label)
                             print(f"Predictions for {label} texts in {read_dir} saved.")
                     else:
                         print(f"Directory not found, skipping: {text_dir}")
         # ==================================================================
+        # === END OF NEW BLOCK ===
         # ==================================================================
     else:
         dataset_name = "anson-huang/mirage-news"
         available_splits = list(load_dataset(dataset_name).keys())
         if test_only:
+            # Keep only the test splits (names starting with 'test')
             available_splits = [s for s in available_splits if s.startswith('test')]
         for split in available_splits:
             if split not in ['train', 'validation'] and not test_only:
+                 continue # Skip test splits when test_only flag is not set
             dataset = load_dataset(dataset_name, split=split)
             for label in ["real", "fake"]:

miragenews/img/resources.py CHANGED Viewed

@@ -4,7 +4,7 @@ from google.cloud import vision
 from sentence_transformers import SentenceTransformer
 import os
-# Import local modules (giữ nguyên logic cũ của bạn)
 from .semantic_filter import SemanticFilter
 from miragenews.models import get_model

 from sentence_transformers import SentenceTransformer
 import os
+# Import local modules (keep your existing logic)
 from .semantic_filter import SemanticFilter
 from miragenews.models import get_model

miragenews/img/web_utils.py CHANGED Viewed

@@ -98,7 +98,7 @@ async def find_best_url_fast_scan_bs4(
             continue
         if not img_tags:
-            print("[Fast Scan] Không tìm thấy thẻ <img>.")
             continue
         for img_tag in img_tags:
@@ -124,18 +124,18 @@ async def find_best_url_fast_scan_bs4(
                     best_url = url
                 if sim > 0.9:
-                    print(f"✅ [Fast Scan] TÌM THẤY KHỚP > 0.9 (Sim: {sim:.4f}) tại: {url}")
                     return url, sim
             except Exception as e:
                 pass
     if best_url:
-        print(f"ℹ️ [Fast Scan] Không tìm thấy > 0.9. Chọn URL khớp nhất: {best_url} (Sim: {max_sim:.4f})")
         return best_url, max_sim
     if not best_url and urls:
-        print(f"ℹ️ [Fast Scan] Không tìm thấy ảnh nào. Chọn URL đầu tiên làm dự phòng.")
         return urls[0], 0.0
     return None, 0.0
@@ -150,12 +150,12 @@ async def get_html_context_block_bs4(
     api_key: Optional[str],
     progress: gr.Progress
 ) -> str:
-    print(f"--- [Deep Scan] Lấy khối HTML từ: {url} ---")
     progress(0.6, desc="Phase 2/2: Deep scan (Fetching HTML block)...")
     html = await scrape_html_with_fallback(url, client, api_key)
     if not html:
-        print("[Deep Scan] Scrape HTML thất bại.")
         return ""
     try:
@@ -191,7 +191,7 @@ async def get_html_context_block_bs4(
                 pass
         if best_tag:
-            print(f"[Deep Scan] Tìm thấy ảnh khớp nhất (Sim: {max_sim:.4f}). Đang tìm khối cha...")
             current = best_tag
             for _ in range(5):
@@ -200,12 +200,12 @@ async def get_html_context_block_bs4(
                     break
                 parent_name = parent.name.lower()
                 if parent_name in ['article', 'section', 'li', 'main']:
-                    print(f"[Deep Scan] Tìm thấy khối semantic: <{parent_name}>")
                     return str(parent)
                 if parent_name == 'div':
                     class_list = parent.get('class', [])
                     if any(cls in ['content', 'post', 'article', 'story-body', 'caption'] for cls in class_list):
-                        print(f"[Deep Scan] Tìm thấy khối div quan trọng: {class_list}")
                         return str(parent)
                 current = parent
@@ -216,9 +216,9 @@ async def get_html_context_block_bs4(
             else:
                 return str(best_tag.parent)
         else:
-            print("[Deep Scan] Không tìm thấy ảnh khớp nào.")
             return ""
     except Exception as e:
-        print(f"❌ [Deep Scan] Lỗi khi phân tích HTML: {e}")
         return ""

             continue
         if not img_tags:
+            print("[Fast Scan] No <img> tag found.")
             continue
         for img_tag in img_tags:
                     best_url = url
                 if sim > 0.9:
+                    print(f"✅ [Fast Scan] FOUND MATCH > 0.9 (Sim: {sim:.4f}) at: {url}")
                     return url, sim
             except Exception as e:
                 pass
     if best_url:
+        print(f"ℹ️ [Fast Scan] No similarity > 0.9. Using best-match URL: {best_url} (Sim: {max_sim:.4f})")
         return best_url, max_sim
     if not best_url and urls:
+        print(f"ℹ️ [Fast Scan] No images found. Using the first URL as fallback.")
         return urls[0], 0.0
     return None, 0.0
     api_key: Optional[str],
     progress: gr.Progress
 ) -> str:
+    print(f"--- [Deep Scan] Fetching HTML block from: {url} ---")
     progress(0.6, desc="Phase 2/2: Deep scan (Fetching HTML block)...")
     html = await scrape_html_with_fallback(url, client, api_key)
     if not html:
+        print("[Deep Scan] HTML scrape failed.")
         return ""
     try:
                 pass
         if best_tag:
+            print(f"[Deep Scan] Found closest matching image (Sim: {max_sim:.4f}). Looking for parent block...")
             current = best_tag
             for _ in range(5):
                     break
                 parent_name = parent.name.lower()
                 if parent_name in ['article', 'section', 'li', 'main']:
+                    print(f"[Deep Scan] Found semantic block: <{parent_name}>")
                     return str(parent)
                 if parent_name == 'div':
                     class_list = parent.get('class', [])
                     if any(cls in ['content', 'post', 'article', 'story-body', 'caption'] for cls in class_list):
+                        print(f"[Deep Scan] Found important div block: {class_list}")
                         return str(parent)
                 current = parent
             else:
                 return str(best_tag.parent)
         else:
+            print("[Deep Scan] No matching images found.")
             return ""
     except Exception as e:
+        print(f"❌ [Deep Scan] Error parsing HTML: {e}")
         return ""

miragenews/merge_img_text.py CHANGED Viewed

@@ -5,45 +5,45 @@ from img.core import analyze_saved_images
 from text_module.pipeline import verify_text_logic
 from text_module.TextAnalysisResult import TextAnalysisResult
-# --- HELPER: BÓC TÁCH REPORT ---
 def parse_child_report(report_text):
     """
-    Dùng Regex lấy giá trị từng dòng cụ thể.
     """
     data = {
         "auth": "N/A", "tools": "Unknown", "synth": "N/A", "artifacts": ""
     }
     if not report_text: return data
-    # 1. Lấy Authenticity Assessment (Quan trọng nhất)
-    # Regex này chỉ lấy nội dung trên cùng 1 dòng sau dấu hai chấm
     auth_match = re.search(r"Authenticity Assessment:\s*(.+)", report_text)
     if auth_match:
         data["auth"] = auth_match.group(1).strip()
-    # 2. Lấy Tools
     tools_match = re.search(r"Verification Tools & Methods:\s*(.+)", report_text)
     if tools_match:
         data["tools"] = tools_match.group(1).strip()
-    # 3. Lấy Synthetic Type
     synth_match = re.search(r"Synthetic Type \(if applicable\):\s*(.+)", report_text)
     if synth_match:
         data["synth"] = synth_match.group(1).strip()
-    # 4. Lấy Artifacts (Lấy từ dòng đó xuống hết)
     art_match = re.search(r"Other Artifacts:\s*(.*)", report_text, re.DOTALL)
     if art_match:
         data["artifacts"] = art_match.group(1).strip()
     return data
-# --- HELPER: CHECK FAKE CHỈ TRÊN DÒNG ASSESSMENT ---
 def is_verdict_fake(assessment_string):
     if not assessment_string: return False
     s = assessment_string.lower().strip()
-    # Các từ khóa khẳng định là FAKE
     fake_keywords = ["not real", "fake", "manipulated", "generated", "artificial", "synthetic"]
     for kw in fake_keywords:
@@ -51,7 +51,7 @@ def is_verdict_fake(assessment_string):
             return True
     return False
-# --- HTML STATUS BAR (GIỮ NGUYÊN) ---
 def create_status_html(label, status, message):
     color = "#9ca3af"; percent = 5; icon = "⏳"; bg_pulse = ""; text_color = "#374151"
     if status == 'processing':
@@ -75,7 +75,7 @@ def create_status_html(label, status, message):
     """
     return html
-# --- TASK 1: XỬ LÝ ẢNH ---
 async def run_image_task(shared_state, image_input):
     shared_state['img_status'] = 'processing'
     shared_state['img_msg'] = "Scanning artifacts..."
@@ -89,10 +89,10 @@ async def run_image_task(shared_state, image_input):
         else:
             for res in gen: final_json, final_report_md = res
-        # Lưu toàn bộ chuỗi report vào artifact
         img_result_obj.set_other_artifacts(final_report_md)
-        # Parse lấy đúng dòng Auth để set status cho object (để dùng cho short-circuit nếu cần)
         parsed = parse_child_report(final_report_md)
         img_result_obj.set_authenticity_assessment(parsed["auth"])
@@ -104,7 +104,7 @@ async def run_image_task(shared_state, image_input):
         img_result_obj.set_authenticity_assessment("Error")
     return img_result_obj
-# --- TASK 2: XỬ LÝ TEXT ---
 async def run_text_task(shared_state, text_input):
     shared_state['txt_status'] = 'processing'
     shared_state['txt_msg'] = "Verifying logic..."
@@ -137,7 +137,7 @@ async def verify_multimodal_logic(image_state, text_input):
         if task_img.done() and img_res is None:
             try:
                 img_res = task_img.result()
-                # Check Fake chỉ dựa trên Assessment (ngắn gọn)
                 if is_verdict_fake(img_res.get_authenticity_assessment()):
                     if not task_txt.done(): task_txt.cancel(); shared_state['txt_msg'] = "Stopped (Image is Fake)"
                     break
@@ -146,7 +146,7 @@ async def verify_multimodal_logic(image_state, text_input):
         if task_txt.done() and txt_res is None:
             try:
                 txt_res = task_txt.result()
-                # Check Fake chỉ dựa trên Assessment
                 if is_verdict_fake(txt_res.get_authenticity_assessment()):
                     if not task_img.done(): task_img.cancel(); shared_state['img_msg'] = "Stopped (Text is Fake)"
                     break
@@ -159,17 +159,17 @@ async def verify_multimodal_logic(image_state, text_input):
     if not txt_res: txt_res = TextAnalysisResult(authenticity_assessment="Skipped")
     # =========================================================================
-    # LOGIC MERGE: CHỈ DỰA VÀO DÒNG ASSESSMENT
     # =========================================================================
-    # 1. Parse Image Report để lấy dòng "Authenticity Assessment" sạch
     img_data_parsed = parse_child_report(img_res.get_other_artifacts())
     img_auth_line = img_data_parsed["auth"] # VD: "🧑 REAL PHOTO"
-    # 2. Lấy dòng Assessment của Text
     txt_auth_line = txt_res.get_authenticity_assessment() # VD: "REAL (Authentic)"
-    # 3. KIỂM TRA FAKE/REAL (Dựa trên 2 dòng trên)
     img_is_fake = is_verdict_fake(img_auth_line)
     txt_is_fake = is_verdict_fake(txt_auth_line)
@@ -187,12 +187,12 @@ async def verify_multimodal_logic(image_state, text_input):
     # --- FIELD 3: Synthetic Type ---
     final_synth_list = []
-    # Chỉ lấy Synthetic Type từ module Ảnh nếu Ảnh bị kết luận là Fake
     if img_is_fake:
         s_type = img_data_parsed["synth"] if img_data_parsed["synth"] != "N/A" else "Manipulated Image"
         final_synth_list.append(f"**Image:** {s_type}")
-    # Chỉ lấy Synthetic Type từ module Text nếu Text bị kết luận là Fake
     if txt_is_fake:
         s_type = txt_res.get_synthetic_type()
         if not s_type or s_type == "N/A": s_type = "Generated Content"
@@ -200,36 +200,36 @@ async def verify_multimodal_logic(image_state, text_input):
     final_synth_str = "\n".join(final_synth_list) if final_synth_list else "N/A"
-    # --- FIELD 4: Other Artifacts (Logic hiển thị Source/Artifacts) ---
     final_artifacts_str = ""
-    # Case: Cả 2 Fake -> Show cả 2
     if img_is_fake and txt_is_fake:
         final_artifacts_str = f"**[Image Evidence]**\n{img_data_parsed['artifacts']}\n\n**[Text Evidence]**\n{txt_res.get_other_artifacts()}"
-    # Case: Chỉ Ảnh Fake -> Show ảnh
     elif img_is_fake:
         final_artifacts_str = f"{img_data_parsed['artifacts']}"
-    # Case: Chỉ Text Fake -> Show text
     elif txt_is_fake:
         final_artifacts_str = f"{txt_res.get_other_artifacts()}"
-    # Case: Cả 2 đều REAL -> Show source (nếu có)
     else:
         final_artifacts_str = "Both image and text are verified as authentic by our multi-modal pipeline."
-        # Check source ảnh (Khác N/A, khác rỗng)
         img_src = img_data_parsed.get('artifacts', '').strip()
         if img_src and img_src != "N/A" and "No details" not in img_src:
              final_artifacts_str += f"\n\n**For Image:** {img_src}"
-        # Check source text
         txt_src = txt_res.get_other_artifacts().strip()
         if txt_src and txt_src != "N/A":
              final_artifacts_str += f"\n\n**For Text:** {txt_src}"
-    # TẠO FINAL MARKDOWN
     final_report_md = f"""
 ### 📋 Final Verification Report

 from text_module.pipeline import verify_text_logic
 from text_module.TextAnalysisResult import TextAnalysisResult
+# --- HELPER: PARSE REPORT ---
 def parse_child_report(report_text):
     """
+    Use regex to extract each specific line value.
     """
     data = {
         "auth": "N/A", "tools": "Unknown", "synth": "N/A", "artifacts": ""
     }
     if not report_text: return data
+    # 1. Extract Authenticity Assessment (most important)
+    # This regex only grabs content on the same line after the colon
     auth_match = re.search(r"Authenticity Assessment:\s*(.+)", report_text)
     if auth_match:
         data["auth"] = auth_match.group(1).strip()
+    # 2. Extract Tools
     tools_match = re.search(r"Verification Tools & Methods:\s*(.+)", report_text)
     if tools_match:
         data["tools"] = tools_match.group(1).strip()
+    # 3. Extract Synthetic Type
     synth_match = re.search(r"Synthetic Type \(if applicable\):\s*(.+)", report_text)
     if synth_match:
         data["synth"] = synth_match.group(1).strip()
+    # 4. Extract Artifacts (from that line through the end)
     art_match = re.search(r"Other Artifacts:\s*(.*)", report_text, re.DOTALL)
     if art_match:
         data["artifacts"] = art_match.group(1).strip()
     return data
+# --- HELPER: CHECK FAKE USING ONLY ASSESSMENT LINE ---
 def is_verdict_fake(assessment_string):
     if not assessment_string: return False
     s = assessment_string.lower().strip()
+    # Keywords that indicate a fake verdict
     fake_keywords = ["not real", "fake", "manipulated", "generated", "artificial", "synthetic"]
     for kw in fake_keywords:
             return True
     return False
+# --- HTML STATUS BAR (UNCHANGED) ---
 def create_status_html(label, status, message):
     color = "#9ca3af"; percent = 5; icon = "⏳"; bg_pulse = ""; text_color = "#374151"
     if status == 'processing':
     """
     return html
+# --- TASK 1: PROCESS IMAGES ---
 async def run_image_task(shared_state, image_input):
     shared_state['img_status'] = 'processing'
     shared_state['img_msg'] = "Scanning artifacts..."
         else:
             for res in gen: final_json, final_report_md = res
+        # Save the full report string into artifacts
         img_result_obj.set_other_artifacts(final_report_md)
+        # Parse the Auth line to update status (used for possible short-circuit)
         parsed = parse_child_report(final_report_md)
         img_result_obj.set_authenticity_assessment(parsed["auth"])
         img_result_obj.set_authenticity_assessment("Error")
     return img_result_obj
+# --- TASK 2: PROCESS TEXT ---
 async def run_text_task(shared_state, text_input):
     shared_state['txt_status'] = 'processing'
     shared_state['txt_msg'] = "Verifying logic..."
         if task_img.done() and img_res is None:
             try:
                 img_res = task_img.result()
+                # Check fake verdict using only the Assessment line (short-circuit)
                 if is_verdict_fake(img_res.get_authenticity_assessment()):
                     if not task_txt.done(): task_txt.cancel(); shared_state['txt_msg'] = "Stopped (Image is Fake)"
                     break
         if task_txt.done() and txt_res is None:
             try:
                 txt_res = task_txt.result()
+                # Check fake verdict using only the Assessment line
                 if is_verdict_fake(txt_res.get_authenticity_assessment()):
                     if not task_img.done(): task_img.cancel(); shared_state['img_msg'] = "Stopped (Text is Fake)"
                     break
     if not txt_res: txt_res = TextAnalysisResult(authenticity_assessment="Skipped")
     # =========================================================================
+    # MERGE LOGIC: BASED ONLY ON THE ASSESSMENT LINE
     # =========================================================================
+    # 1. Parse Image Report to extract a clean "Authenticity Assessment" line
     img_data_parsed = parse_child_report(img_res.get_other_artifacts())
     img_auth_line = img_data_parsed["auth"] # e.g. "🧑 REAL PHOTO"
+    # 2. Get the Assessment line for Text
     txt_auth_line = txt_res.get_authenticity_assessment() # e.g. "REAL (Authentic)"
+    # 3. Determine fake/real based on those two lines
     img_is_fake = is_verdict_fake(img_auth_line)
     txt_is_fake = is_verdict_fake(txt_auth_line)
     # --- FIELD 3: Synthetic Type ---
     final_synth_list = []
+    # Only pull Synthetic Type from Image module if Image is deemed Fake
     if img_is_fake:
         s_type = img_data_parsed["synth"] if img_data_parsed["synth"] != "N/A" else "Manipulated Image"
         final_synth_list.append(f"**Image:** {s_type}")
+    # Only pull Synthetic Type from Text module if Text is deemed Fake
     if txt_is_fake:
         s_type = txt_res.get_synthetic_type()
         if not s_type or s_type == "N/A": s_type = "Generated Content"
     final_synth_str = "\n".join(final_synth_list) if final_synth_list else "N/A"
+    # --- FIELD 4: Other Artifacts (Display source/artifacts logic) ---
     final_artifacts_str = ""
+    # Case: both are Fake -> show both
     if img_is_fake and txt_is_fake:
         final_artifacts_str = f"**[Image Evidence]**\n{img_data_parsed['artifacts']}\n\n**[Text Evidence]**\n{txt_res.get_other_artifacts()}"
+    # Case: only Image is Fake -> show image evidence
     elif img_is_fake:
         final_artifacts_str = f"{img_data_parsed['artifacts']}"
+    # Case: only Text is Fake -> show text evidence
     elif txt_is_fake:
         final_artifacts_str = f"{txt_res.get_other_artifacts()}"
+    # Case: both are REAL -> show source if available
     else:
         final_artifacts_str = "Both image and text are verified as authentic by our multi-modal pipeline."
+        # Check image source (non-empty and not N/A)
         img_src = img_data_parsed.get('artifacts', '').strip()
         if img_src and img_src != "N/A" and "No details" not in img_src:
              final_artifacts_str += f"\n\n**For Image:** {img_src}"
+        # Check text source
         txt_src = txt_res.get_other_artifacts().strip()
         if txt_src and txt_src != "N/A":
              final_artifacts_str += f"\n\n**For Text:** {txt_src}"
+    # BUILD FINAL MARKDOWN
     final_report_md = f"""
 ### 📋 Final Verification Report

miragenews/test_single_pair.py CHANGED Viewed

@@ -107,19 +107,19 @@ if __name__ == "__main__":
     else:
         print(f"\nFailed to process {input_pt_path_single}.")
-    print("\n" + "="*50 + "\n") # Thêm dòng phân cách
-    # --- VÍ DỤ XỬ LÝ NHIỀU FILE ---
     pt_files_to_check = [
-        "encodings/predictions/image/merged/my_single_image_dir/real.pt", # Thay bằng đường dẫn file thật
-        # "encodings/predictions/image/merged/another_dir/fake_image.pt", # ĐÃ XÓA DÒNG NÀY
-        "path/to/nonexistent.pt" # Ví dụ file không tồn tại
     ]
     print("\n--- Processing multiple files ---")
     results = {}
     for file_path in pt_files_to_check:
         prob_fake, label = predict_authenticity_from_pt(file_path, mirage_img, device)
-        results[file_path] = (prob_fake, label) # Lưu kết quả vào dictionary
     print("\n--- Summary ---")
     for file, (prob_fake, label) in results.items():

     else:
         print(f"\nFailed to process {input_pt_path_single}.")
+    print("\n" + "="*50 + "\n") # Add a divider line
+    # --- EXAMPLE: PROCESS MULTIPLE FILES ---
     pt_files_to_check = [
+        "encodings/predictions/image/merged/my_single_image_dir/real.pt", # Replace with the real file path
+        # "encodings/predictions/image/merged/another_dir/fake_image.pt", # THIS LINE WAS REMOVED
+        "path/to/nonexistent.pt" # Example of a missing file
     ]
     print("\n--- Processing multiple files ---")
     results = {}
     for file_path in pt_files_to_check:
         prob_fake, label = predict_authenticity_from_pt(file_path, mirage_img, device)
+        results[file_path] = (prob_fake, label) # Store results in a dictionary
     print("\n--- Summary ---")
     for file, (prob_fake, label) in results.items():

miragenews/text_module/config.py CHANGED Viewed

@@ -5,9 +5,9 @@ from dotenv import load_dotenv
 load_dotenv()
 # API Keys
-GOOGLE_API_KEY = os.getenv("GOOGLE_CSE_CX") # Dùng cho Gemini (theo code cũ của bạn)
 GOOGLE_SAFE_BROWSING_API_KEY = os.getenv("GOOGLE_SAFE_BROWSING_API_KEY")
-GOOGLE_CX = os.getenv("GOOGLE_CSE_CX") # Dùng cho Search
 SCRAPINGDOG_API_KEY = os.getenv("SCRAPINGDOG_API_KEY")
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

 load_dotenv()
 # API Keys
+GOOGLE_API_KEY = os.getenv("GOOGLE_CSE_CX") # Used for Gemini (per the previous code). NOTE(review): reads the same GOOGLE_CSE_CX variable as GOOGLE_CX below - confirm this is intended
 GOOGLE_SAFE_BROWSING_API_KEY = os.getenv("GOOGLE_SAFE_BROWSING_API_KEY")
+GOOGLE_CX = os.getenv("GOOGLE_CSE_CX") # Used for search
 SCRAPINGDOG_API_KEY = os.getenv("SCRAPINGDOG_API_KEY")
 GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

miragenews/text_module/llm_utils.py CHANGED Viewed

@@ -61,7 +61,7 @@ def ask_llm_to_rewrite(text_content):
             max_output_tokens=MAX_TOKENS
         )
-        # 4. Gọi hàm generate_content
         response = flash_model.generate_content(
             full_prompt,
             generation_config=config

             max_output_tokens=MAX_TOKENS
         )
+        # 4. Call generate_content
         response = flash_model.generate_content(
             full_prompt,
             generation_config=config