Lucii1 committed on
Commit
ecccf5c
·
1 Parent(s): 11d339f

refactor code

Browse files
AIGVDet/main.py CHANGED
@@ -15,10 +15,10 @@ def run_video_to_json(
15
  optical_root: str = "optical_result"
16
  ) -> Dict:
17
  """
18
- Xử lý 1 video và ghi kết quả ra file JSON.
19
 
20
  Returns:
21
- dict kết quả (đồng thời ghi ra JSON)
22
  """
23
 
24
  script_dir = os.path.dirname(os.path.abspath(__file__))
 
15
  optical_root: str = "optical_result"
16
  ) -> Dict:
17
  """
18
+ Process a single video and write the result to a JSON file.
19
 
20
  Returns:
21
+ result dict (also written to a JSON file)
22
  """
23
 
24
  script_dir = os.path.dirname(os.path.abspath(__file__))
api_server.py CHANGED
@@ -25,14 +25,14 @@ cred_json = os.getenv("GOOGLE_CREDENTIALS_JSON")
25
  if ENV == "hf":
26
  if cred_json:
27
  try:
28
- # Parse để đảm bảo JSON hợp lệ
29
  json.loads(cred_json)
30
 
31
  file_path = "google-credentials.json"
32
  with open(file_path, "w") as f:
33
  f.write(cred_json)
34
 
35
- # Set lại env để google auth tự nhận
36
  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = file_path
37
 
38
  print("[INFO] Google credentials saved to", file_path)
 
25
  if ENV == "hf":
26
  if cred_json:
27
  try:
28
+ # Parse to ensure the JSON payload is valid
29
  json.loads(cred_json)
30
 
31
  file_path = "google-credentials.json"
32
  with open(file_path, "w") as f:
33
  f.write(cred_json)
34
 
35
+ # Set the env var so Google auth picks up the credentials file automatically
36
  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = file_path
37
 
38
  print("[INFO] Google credentials saved to", file_path)
miragenews/data/encode_predictions.py CHANGED
@@ -111,11 +111,11 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
111
 
112
  if mode == "image":
113
  # Load shared processors and models
114
- # Sửa lỗi OOM cho cbm-encoder
115
  object_processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
116
  object_detector = Owlv2ForObjectDetection.from_pretrained(
117
  "google/owlv2-base-patch16-ensemble",
118
- torch_dtype=torch.float16 # Thêm float16
119
  ).to(device)
120
  image_processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
121
  image_encoder = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl").vision_model.to(device)
@@ -176,16 +176,16 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
176
  print(f"Predictions for fake images in {read_dir} saved.")
177
 
178
  # ==================================================================
179
- # === BẮT ĐẦU KHỐI CODE MỚI ĐỂ XỬ LÝ TEXT TÙY CHỈNH ===
180
  # ==================================================================
181
  elif text_dirs:
182
  for read_dir in text_dirs:
183
- # Logic này chỉ chạy cho 'linear' vì TBM (18-dim) không khả thi cho custom text
184
  if model_class != "linear":
185
  print(f"Warning: Only 'linear' model class is supported for custom text dirs. Skipping '{model_class}'.")
186
- continue # Bỏ qua nếu model class không phải 'linear'
187
 
188
- for label in ["real", "fake"]: # Xử lý cả 'real' và 'fake' nếu thư mục tồn tại
189
  text_dir = os.path.join("my_dataset/text", read_dir, label)
190
  batch = []
191
  predictions = []
@@ -194,7 +194,7 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
194
  print(f"Processing directory: {text_dir}")
195
  for text_name in tqdm(sorted(os.listdir(text_dir)), desc=f"Processing {text_dir} with {model_class}"):
196
  text_path = os.path.join(text_dir, text_name)
197
- # Đọc nội dung file text
198
  try:
199
  with open(text_path, 'r', encoding='utf-8') as f:
200
  text = f.read()
@@ -203,25 +203,25 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
203
  continue
204
 
205
  batch.append(text)
206
- # Xử lý batch khi đầy
207
  if len(batch) == batch_size:
208
  text_encoding = preprocess_texts(batch, clip_model, device)
209
  predictions.append(process_txt_linear(model, text_encoding, device))
210
  batch = []
211
 
212
- # Xử lý batch còn sót lại
213
  if batch:
214
  text_encoding = preprocess_texts(batch, clip_model, device)
215
  predictions.append(process_txt_linear(model, text_encoding, device))
216
 
217
- # Lưu file .pt nếu có dự đoán
218
  if predictions:
219
  save_predictions(torch.cat(predictions), output_dir, mode, model_class, read_dir, label)
220
  print(f"Predictions for {label} texts in {read_dir} saved.")
221
  else:
222
  print(f"Directory not found, skipping: {text_dir}")
223
  # ==================================================================
224
- # === KẾT THÚC KHỐI CODE MỚI ===
225
  # ==================================================================
226
 
227
  else:
@@ -229,12 +229,12 @@ def main(mode, model_class, custom=False, img_dirs=None, text_dirs=None, batch_s
229
  dataset_name = "anson-huang/mirage-news"
230
  available_splits = list(load_dataset(dataset_name).keys())
231
  if test_only:
232
- # Lấy 5 split test đầu tiên
233
  available_splits = [s for s in available_splits if s.startswith('test')]
234
 
235
  for split in available_splits:
236
  if split not in ['train', 'validation'] and not test_only:
237
- continue # Bỏ qua các split test nếu không có cờ test_only
238
 
239
  dataset = load_dataset(dataset_name, split=split)
240
  for label in ["real", "fake"]:
 
111
 
112
  if mode == "image":
113
  # Load shared processors and models
114
+ # Fix OOM issues for cbm-encoder
115
  object_processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
116
  object_detector = Owlv2ForObjectDetection.from_pretrained(
117
  "google/owlv2-base-patch16-ensemble",
118
+ torch_dtype=torch.float16 # Add float16
119
  ).to(device)
120
  image_processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
121
  image_encoder = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl").vision_model.to(device)
 
176
  print(f"Predictions for fake images in {read_dir} saved.")
177
 
178
  # ==================================================================
179
+ # === START NEW BLOCK TO HANDLE CUSTOM TEXT ===
180
  # ==================================================================
181
  elif text_dirs:
182
  for read_dir in text_dirs:
183
+ # This logic only runs for 'linear' because TBM (18-dim) is not feasible for custom text
184
  if model_class != "linear":
185
  print(f"Warning: Only 'linear' model class is supported for custom text dirs. Skipping '{model_class}'.")
186
+ continue # Skip if the model class is not 'linear'
187
 
188
+ for label in ["real", "fake"]: # Handle both 'real' and 'fake' folders when present
189
  text_dir = os.path.join("my_dataset/text", read_dir, label)
190
  batch = []
191
  predictions = []
 
194
  print(f"Processing directory: {text_dir}")
195
  for text_name in tqdm(sorted(os.listdir(text_dir)), desc=f"Processing {text_dir} with {model_class}"):
196
  text_path = os.path.join(text_dir, text_name)
197
+ # Read text file contents
198
  try:
199
  with open(text_path, 'r', encoding='utf-8') as f:
200
  text = f.read()
 
203
  continue
204
 
205
  batch.append(text)
206
+ # Process batch when it reaches capacity
207
  if len(batch) == batch_size:
208
  text_encoding = preprocess_texts(batch, clip_model, device)
209
  predictions.append(process_txt_linear(model, text_encoding, device))
210
  batch = []
211
 
212
+ # Process any remaining batch
213
  if batch:
214
  text_encoding = preprocess_texts(batch, clip_model, device)
215
  predictions.append(process_txt_linear(model, text_encoding, device))
216
 
217
+ # Save .pt file if predictions exist
218
  if predictions:
219
  save_predictions(torch.cat(predictions), output_dir, mode, model_class, read_dir, label)
220
  print(f"Predictions for {label} texts in {read_dir} saved.")
221
  else:
222
  print(f"Directory not found, skipping: {text_dir}")
223
  # ==================================================================
224
+ # === END OF NEW BLOCK ===
225
  # ==================================================================
226
 
227
  else:
 
229
  dataset_name = "anson-huang/mirage-news"
230
  available_splits = list(load_dataset(dataset_name).keys())
231
  if test_only:
232
+ # Keep only the test splits (split names starting with 'test')
233
  available_splits = [s for s in available_splits if s.startswith('test')]
234
 
235
  for split in available_splits:
236
  if split not in ['train', 'validation'] and not test_only:
237
+ continue # Skip test splits when test_only flag is not set
238
 
239
  dataset = load_dataset(dataset_name, split=split)
240
  for label in ["real", "fake"]:
miragenews/img/resources.py CHANGED
@@ -4,7 +4,7 @@ from google.cloud import vision
4
  from sentence_transformers import SentenceTransformer
5
  import os
6
 
7
- # Import local modules (giữ nguyên logic cũ của bạn)
8
 
9
  from .semantic_filter import SemanticFilter
10
  from miragenews.models import get_model
 
4
  from sentence_transformers import SentenceTransformer
5
  import os
6
 
7
+ # Import local modules (existing logic kept unchanged)
8
 
9
  from .semantic_filter import SemanticFilter
10
  from miragenews.models import get_model
miragenews/img/web_utils.py CHANGED
@@ -98,7 +98,7 @@ async def find_best_url_fast_scan_bs4(
98
  continue
99
 
100
  if not img_tags:
101
- print("[Fast Scan] Không tìm thấy thẻ <img>.")
102
  continue
103
 
104
  for img_tag in img_tags:
@@ -124,18 +124,18 @@ async def find_best_url_fast_scan_bs4(
124
  best_url = url
125
 
126
  if sim > 0.9:
127
- print(f"✅ [Fast Scan] TÌM THẤY KHỚP > 0.9 (Sim: {sim:.4f}) tại: {url}")
128
  return url, sim
129
 
130
  except Exception as e:
131
  pass
132
 
133
  if best_url:
134
- print(f"ℹ️ [Fast Scan] Không tìm thấy > 0.9. Chọn URL khớp nhất: {best_url} (Sim: {max_sim:.4f})")
135
  return best_url, max_sim
136
 
137
  if not best_url and urls:
138
- print(f"ℹ️ [Fast Scan] Không tìm thấy ảnh nào. Chọn URL đầu tiên làm dự phòng.")
139
  return urls[0], 0.0
140
 
141
  return None, 0.0
@@ -150,12 +150,12 @@ async def get_html_context_block_bs4(
150
  api_key: Optional[str],
151
  progress: gr.Progress
152
  ) -> str:
153
- print(f"--- [Deep Scan] Lấy khối HTML từ: {url} ---")
154
  progress(0.6, desc="Phase 2/2: Deep scan (Fetching HTML block)...")
155
 
156
  html = await scrape_html_with_fallback(url, client, api_key)
157
  if not html:
158
- print("[Deep Scan] Scrape HTML thất bại.")
159
  return ""
160
 
161
  try:
@@ -191,7 +191,7 @@ async def get_html_context_block_bs4(
191
  pass
192
 
193
  if best_tag:
194
- print(f"[Deep Scan] Tìm thấy ảnh khớp nhất (Sim: {max_sim:.4f}). Đang tìm khối cha...")
195
 
196
  current = best_tag
197
  for _ in range(5):
@@ -200,12 +200,12 @@ async def get_html_context_block_bs4(
200
  break
201
  parent_name = parent.name.lower()
202
  if parent_name in ['article', 'section', 'li', 'main']:
203
- print(f"[Deep Scan] Tìm thấy khối semantic: <{parent_name}>")
204
  return str(parent)
205
  if parent_name == 'div':
206
  class_list = parent.get('class', [])
207
  if any(cls in ['content', 'post', 'article', 'story-body', 'caption'] for cls in class_list):
208
- print(f"[Deep Scan] Tìm thấy khối div quan trọng: {class_list}")
209
  return str(parent)
210
  current = parent
211
 
@@ -216,9 +216,9 @@ async def get_html_context_block_bs4(
216
  else:
217
  return str(best_tag.parent)
218
  else:
219
- print("[Deep Scan] Không tìm thấy ảnh khớp nào.")
220
  return ""
221
  except Exception as e:
222
- print(f"❌ [Deep Scan] Lỗi khi phân tích HTML: {e}")
223
  return ""
224
 
 
98
  continue
99
 
100
  if not img_tags:
101
+ print("[Fast Scan] No <img> tag found.")
102
  continue
103
 
104
  for img_tag in img_tags:
 
124
  best_url = url
125
 
126
  if sim > 0.9:
127
+ print(f"✅ [Fast Scan] FOUND MATCH > 0.9 (Sim: {sim:.4f}) at: {url}")
128
  return url, sim
129
 
130
  except Exception as e:
131
  pass
132
 
133
  if best_url:
134
+ print(f"ℹ️ [Fast Scan] No similarity > 0.9. Using best-match URL: {best_url} (Sim: {max_sim:.4f})")
135
  return best_url, max_sim
136
 
137
  if not best_url and urls:
138
+ print(f"ℹ️ [Fast Scan] No images found. Using the first URL as fallback.")
139
  return urls[0], 0.0
140
 
141
  return None, 0.0
 
150
  api_key: Optional[str],
151
  progress: gr.Progress
152
  ) -> str:
153
+ print(f"--- [Deep Scan] Fetching HTML block from: {url} ---")
154
  progress(0.6, desc="Phase 2/2: Deep scan (Fetching HTML block)...")
155
 
156
  html = await scrape_html_with_fallback(url, client, api_key)
157
  if not html:
158
+ print("[Deep Scan] HTML scrape failed.")
159
  return ""
160
 
161
  try:
 
191
  pass
192
 
193
  if best_tag:
194
+ print(f"[Deep Scan] Found closest matching image (Sim: {max_sim:.4f}). Looking for parent block...")
195
 
196
  current = best_tag
197
  for _ in range(5):
 
200
  break
201
  parent_name = parent.name.lower()
202
  if parent_name in ['article', 'section', 'li', 'main']:
203
+ print(f"[Deep Scan] Found semantic block: <{parent_name}>")
204
  return str(parent)
205
  if parent_name == 'div':
206
  class_list = parent.get('class', [])
207
  if any(cls in ['content', 'post', 'article', 'story-body', 'caption'] for cls in class_list):
208
+ print(f"[Deep Scan] Found important div block: {class_list}")
209
  return str(parent)
210
  current = parent
211
 
 
216
  else:
217
  return str(best_tag.parent)
218
  else:
219
+ print("[Deep Scan] No matching images found.")
220
  return ""
221
  except Exception as e:
222
+ print(f"❌ [Deep Scan] Error parsing HTML: {e}")
223
  return ""
224
 
miragenews/merge_img_text.py CHANGED
@@ -5,45 +5,45 @@ from img.core import analyze_saved_images
5
  from text_module.pipeline import verify_text_logic
6
  from text_module.TextAnalysisResult import TextAnalysisResult
7
 
8
- # --- HELPER: BÓC TÁCH REPORT ---
9
  def parse_child_report(report_text):
10
  """
11
- Dùng Regex lấy giá trị từng dòng cụ thể.
12
  """
13
  data = {
14
  "auth": "N/A", "tools": "Unknown", "synth": "N/A", "artifacts": ""
15
  }
16
  if not report_text: return data
17
 
18
- # 1. Lấy Authenticity Assessment (Quan trọng nhất)
19
- # Regex này chỉ lấy nội dung trên cùng 1 dòng sau dấu hai chấm
20
  auth_match = re.search(r"Authenticity Assessment:\s*(.+)", report_text)
21
  if auth_match:
22
  data["auth"] = auth_match.group(1).strip()
23
 
24
- # 2. Lấy Tools
25
  tools_match = re.search(r"Verification Tools & Methods:\s*(.+)", report_text)
26
  if tools_match:
27
  data["tools"] = tools_match.group(1).strip()
28
 
29
- # 3. Lấy Synthetic Type
30
  synth_match = re.search(r"Synthetic Type \(if applicable\):\s*(.+)", report_text)
31
  if synth_match:
32
  data["synth"] = synth_match.group(1).strip()
33
 
34
- # 4. Lấy Artifacts (Lấy từ dòng đó xuống hết)
35
  art_match = re.search(r"Other Artifacts:\s*(.*)", report_text, re.DOTALL)
36
  if art_match:
37
  data["artifacts"] = art_match.group(1).strip()
38
 
39
  return data
40
 
41
- # --- HELPER: CHECK FAKE CHỈ TRÊN DÒNG ASSESSMENT ---
42
  def is_verdict_fake(assessment_string):
43
  if not assessment_string: return False
44
  s = assessment_string.lower().strip()
45
 
46
- # Các từ khóa khẳng định là FAKE
47
  fake_keywords = ["not real", "fake", "manipulated", "generated", "artificial", "synthetic"]
48
 
49
  for kw in fake_keywords:
@@ -51,7 +51,7 @@ def is_verdict_fake(assessment_string):
51
  return True
52
  return False
53
 
54
- # --- HTML STATUS BAR (GIỮ NGUYÊN) ---
55
  def create_status_html(label, status, message):
56
  color = "#9ca3af"; percent = 5; icon = "⏳"; bg_pulse = ""; text_color = "#374151"
57
  if status == 'processing':
@@ -75,7 +75,7 @@ def create_status_html(label, status, message):
75
  """
76
  return html
77
 
78
- # --- TASK 1: XỬ LÝ ẢNH ---
79
  async def run_image_task(shared_state, image_input):
80
  shared_state['img_status'] = 'processing'
81
  shared_state['img_msg'] = "Scanning artifacts..."
@@ -89,10 +89,10 @@ async def run_image_task(shared_state, image_input):
89
  else:
90
  for res in gen: final_json, final_report_md = res
91
 
92
- # Lưu toàn bộ chuỗi report vào artifact
93
  img_result_obj.set_other_artifacts(final_report_md)
94
 
95
- # Parse lấy đúng dòng Auth để set status cho object (để dùng cho short-circuit nếu cần)
96
  parsed = parse_child_report(final_report_md)
97
  img_result_obj.set_authenticity_assessment(parsed["auth"])
98
 
@@ -104,7 +104,7 @@ async def run_image_task(shared_state, image_input):
104
  img_result_obj.set_authenticity_assessment("Error")
105
  return img_result_obj
106
 
107
- # --- TASK 2: XỬ LÝ TEXT ---
108
  async def run_text_task(shared_state, text_input):
109
  shared_state['txt_status'] = 'processing'
110
  shared_state['txt_msg'] = "Verifying logic..."
@@ -137,7 +137,7 @@ async def verify_multimodal_logic(image_state, text_input):
137
  if task_img.done() and img_res is None:
138
  try:
139
  img_res = task_img.result()
140
- # Check Fake chỉ dựa trên Assessment (ngắn gọn)
141
  if is_verdict_fake(img_res.get_authenticity_assessment()):
142
  if not task_txt.done(): task_txt.cancel(); shared_state['txt_msg'] = "Stopped (Image is Fake)"
143
  break
@@ -146,7 +146,7 @@ async def verify_multimodal_logic(image_state, text_input):
146
  if task_txt.done() and txt_res is None:
147
  try:
148
  txt_res = task_txt.result()
149
- # Check Fake chỉ dựa trên Assessment
150
  if is_verdict_fake(txt_res.get_authenticity_assessment()):
151
  if not task_img.done(): task_img.cancel(); shared_state['img_msg'] = "Stopped (Text is Fake)"
152
  break
@@ -159,17 +159,17 @@ async def verify_multimodal_logic(image_state, text_input):
159
  if not txt_res: txt_res = TextAnalysisResult(authenticity_assessment="Skipped")
160
 
161
  # =========================================================================
162
- # LOGIC MERGE: CHỈ DỰA VÀO DÒNG ASSESSMENT
163
  # =========================================================================
164
 
165
- # 1. Parse Image Report để lấy dòng "Authenticity Assessment" sạch
166
  img_data_parsed = parse_child_report(img_res.get_other_artifacts())
167
  img_auth_line = img_data_parsed["auth"] # VD: "🧑 REAL PHOTO"
168
 
169
- # 2. Lấy dòng Assessment của Text
170
  txt_auth_line = txt_res.get_authenticity_assessment() # VD: "REAL (Authentic)"
171
 
172
- # 3. KIỂM TRA FAKE/REAL (Dựa trên 2 dòng trên)
173
  img_is_fake = is_verdict_fake(img_auth_line)
174
  txt_is_fake = is_verdict_fake(txt_auth_line)
175
 
@@ -187,12 +187,12 @@ async def verify_multimodal_logic(image_state, text_input):
187
  # --- FIELD 3: Synthetic Type ---
188
  final_synth_list = []
189
 
190
- # Chỉ lấy Synthetic Type từ module Ảnh nếu Ảnh bị kết luận là Fake
191
  if img_is_fake:
192
  s_type = img_data_parsed["synth"] if img_data_parsed["synth"] != "N/A" else "Manipulated Image"
193
  final_synth_list.append(f"**Image:** {s_type}")
194
 
195
- # Chỉ lấy Synthetic Type từ module Text nếu Text bị kết luận là Fake
196
  if txt_is_fake:
197
  s_type = txt_res.get_synthetic_type()
198
  if not s_type or s_type == "N/A": s_type = "Generated Content"
@@ -200,36 +200,36 @@ async def verify_multimodal_logic(image_state, text_input):
200
 
201
  final_synth_str = "\n".join(final_synth_list) if final_synth_list else "N/A"
202
 
203
- # --- FIELD 4: Other Artifacts (Logic hiển thị Source/Artifacts) ---
204
  final_artifacts_str = ""
205
 
206
- # Case: Cả 2 Fake -> Show cả 2
207
  if img_is_fake and txt_is_fake:
208
  final_artifacts_str = f"**[Image Evidence]**\n{img_data_parsed['artifacts']}\n\n**[Text Evidence]**\n{txt_res.get_other_artifacts()}"
209
 
210
- # Case: Chỉ Ảnh Fake -> Show ảnh
211
  elif img_is_fake:
212
  final_artifacts_str = f"{img_data_parsed['artifacts']}"
213
 
214
- # Case: Chỉ Text Fake -> Show text
215
  elif txt_is_fake:
216
  final_artifacts_str = f"{txt_res.get_other_artifacts()}"
217
 
218
- # Case: Cả 2 đều REAL -> Show source (nếu có)
219
  else:
220
  final_artifacts_str = "Both image and text are verified as authentic by our multi-modal pipeline."
221
 
222
- # Check source ảnh (Khác N/A, khác rỗng)
223
  img_src = img_data_parsed.get('artifacts', '').strip()
224
  if img_src and img_src != "N/A" and "No details" not in img_src:
225
  final_artifacts_str += f"\n\n**For Image:** {img_src}"
226
 
227
- # Check source text
228
  txt_src = txt_res.get_other_artifacts().strip()
229
  if txt_src and txt_src != "N/A":
230
  final_artifacts_str += f"\n\n**For Text:** {txt_src}"
231
 
232
- # TẠO FINAL MARKDOWN
233
  final_report_md = f"""
234
  ### 📋 Final Verification Report
235
 
 
5
  from text_module.pipeline import verify_text_logic
6
  from text_module.TextAnalysisResult import TextAnalysisResult
7
 
8
+ # --- HELPER: PARSE REPORT ---
9
  def parse_child_report(report_text):
10
  """
11
+ Use regex to extract each specific line value.
12
  """
13
  data = {
14
  "auth": "N/A", "tools": "Unknown", "synth": "N/A", "artifacts": ""
15
  }
16
  if not report_text: return data
17
 
18
+ # 1. Extract Authenticity Assessment (most important)
19
+ # This regex only grabs content on the same line after the colon
20
  auth_match = re.search(r"Authenticity Assessment:\s*(.+)", report_text)
21
  if auth_match:
22
  data["auth"] = auth_match.group(1).strip()
23
 
24
+ # 2. Extract Tools
25
  tools_match = re.search(r"Verification Tools & Methods:\s*(.+)", report_text)
26
  if tools_match:
27
  data["tools"] = tools_match.group(1).strip()
28
 
29
+ # 3. Extract Synthetic Type
30
  synth_match = re.search(r"Synthetic Type \(if applicable\):\s*(.+)", report_text)
31
  if synth_match:
32
  data["synth"] = synth_match.group(1).strip()
33
 
34
+ # 4. Extract Artifacts (from that line through the end)
35
  art_match = re.search(r"Other Artifacts:\s*(.*)", report_text, re.DOTALL)
36
  if art_match:
37
  data["artifacts"] = art_match.group(1).strip()
38
 
39
  return data
40
 
41
+ # --- HELPER: CHECK FAKE USING ONLY ASSESSMENT LINE ---
42
  def is_verdict_fake(assessment_string):
43
  if not assessment_string: return False
44
  s = assessment_string.lower().strip()
45
 
46
+ # Keywords that indicate a fake verdict
47
  fake_keywords = ["not real", "fake", "manipulated", "generated", "artificial", "synthetic"]
48
 
49
  for kw in fake_keywords:
 
51
  return True
52
  return False
53
 
54
+ # --- HTML STATUS BAR (UNCHANGED) ---
55
  def create_status_html(label, status, message):
56
  color = "#9ca3af"; percent = 5; icon = "⏳"; bg_pulse = ""; text_color = "#374151"
57
  if status == 'processing':
 
75
  """
76
  return html
77
 
78
+ # --- TASK 1: PROCESS IMAGES ---
79
  async def run_image_task(shared_state, image_input):
80
  shared_state['img_status'] = 'processing'
81
  shared_state['img_msg'] = "Scanning artifacts..."
 
89
  else:
90
  for res in gen: final_json, final_report_md = res
91
 
92
+ # Save the full report string into artifacts
93
  img_result_obj.set_other_artifacts(final_report_md)
94
 
95
+ # Parse the Auth line to update status (used for possible short-circuit)
96
  parsed = parse_child_report(final_report_md)
97
  img_result_obj.set_authenticity_assessment(parsed["auth"])
98
 
 
104
  img_result_obj.set_authenticity_assessment("Error")
105
  return img_result_obj
106
 
107
+ # --- TASK 2: PROCESS TEXT ---
108
  async def run_text_task(shared_state, text_input):
109
  shared_state['txt_status'] = 'processing'
110
  shared_state['txt_msg'] = "Verifying logic..."
 
137
  if task_img.done() and img_res is None:
138
  try:
139
  img_res = task_img.result()
140
+ # Check fake verdict using only the Assessment line (short-circuit)
141
  if is_verdict_fake(img_res.get_authenticity_assessment()):
142
  if not task_txt.done(): task_txt.cancel(); shared_state['txt_msg'] = "Stopped (Image is Fake)"
143
  break
 
146
  if task_txt.done() and txt_res is None:
147
  try:
148
  txt_res = task_txt.result()
149
+ # Check fake verdict using only the Assessment line
150
  if is_verdict_fake(txt_res.get_authenticity_assessment()):
151
  if not task_img.done(): task_img.cancel(); shared_state['img_msg'] = "Stopped (Text is Fake)"
152
  break
 
159
  if not txt_res: txt_res = TextAnalysisResult(authenticity_assessment="Skipped")
160
 
161
  # =========================================================================
162
+ # MERGE LOGIC: BASED ONLY ON THE ASSESSMENT LINE
163
  # =========================================================================
164
 
165
+ # 1. Parse Image Report to extract a clean "Authenticity Assessment" line
166
  img_data_parsed = parse_child_report(img_res.get_other_artifacts())
167
  img_auth_line = img_data_parsed["auth"] # e.g. "🧑 REAL PHOTO"
168
 
169
+ # 2. Get the Assessment line for Text
170
  txt_auth_line = txt_res.get_authenticity_assessment() # e.g. "REAL (Authentic)"
171
 
172
+ # 3. Determine fake/real based on those two lines
173
  img_is_fake = is_verdict_fake(img_auth_line)
174
  txt_is_fake = is_verdict_fake(txt_auth_line)
175
 
 
187
  # --- FIELD 3: Synthetic Type ---
188
  final_synth_list = []
189
 
190
+ # Only pull Synthetic Type from Image module if Image is deemed Fake
191
  if img_is_fake:
192
  s_type = img_data_parsed["synth"] if img_data_parsed["synth"] != "N/A" else "Manipulated Image"
193
  final_synth_list.append(f"**Image:** {s_type}")
194
 
195
+ # Only pull Synthetic Type from Text module if Text is deemed Fake
196
  if txt_is_fake:
197
  s_type = txt_res.get_synthetic_type()
198
  if not s_type or s_type == "N/A": s_type = "Generated Content"
 
200
 
201
  final_synth_str = "\n".join(final_synth_list) if final_synth_list else "N/A"
202
 
203
+ # --- FIELD 4: Other Artifacts (Display source/artifacts logic) ---
204
  final_artifacts_str = ""
205
 
206
+ # Case: both are Fake -> show both
207
  if img_is_fake and txt_is_fake:
208
  final_artifacts_str = f"**[Image Evidence]**\n{img_data_parsed['artifacts']}\n\n**[Text Evidence]**\n{txt_res.get_other_artifacts()}"
209
 
210
+ # Case: only Image is Fake -> show image evidence
211
  elif img_is_fake:
212
  final_artifacts_str = f"{img_data_parsed['artifacts']}"
213
 
214
+ # Case: only Text is Fake -> show text evidence
215
  elif txt_is_fake:
216
  final_artifacts_str = f"{txt_res.get_other_artifacts()}"
217
 
218
+ # Case: both are REAL -> show source if available
219
  else:
220
  final_artifacts_str = "Both image and text are verified as authentic by our multi-modal pipeline."
221
 
222
+ # Check image source (non-empty and not N/A)
223
  img_src = img_data_parsed.get('artifacts', '').strip()
224
  if img_src and img_src != "N/A" and "No details" not in img_src:
225
  final_artifacts_str += f"\n\n**For Image:** {img_src}"
226
 
227
+ # Check text source
228
  txt_src = txt_res.get_other_artifacts().strip()
229
  if txt_src and txt_src != "N/A":
230
  final_artifacts_str += f"\n\n**For Text:** {txt_src}"
231
 
232
+ # BUILD FINAL MARKDOWN
233
  final_report_md = f"""
234
  ### 📋 Final Verification Report
235
 
miragenews/test_single_pair.py CHANGED
@@ -107,19 +107,19 @@ if __name__ == "__main__":
107
  else:
108
  print(f"\nFailed to process {input_pt_path_single}.")
109
 
110
- print("\n" + "="*50 + "\n") # Thêm dòng phân cách
111
 
112
- # --- VÍ DỤ XỬ LÝ NHIỀU FILE ---
113
  pt_files_to_check = [
114
- "encodings/predictions/image/merged/my_single_image_dir/real.pt", # Thay bằng đường dẫn file thật
115
- # "encodings/predictions/image/merged/another_dir/fake_image.pt", # ĐÃ XÓA DÒNG NÀY
116
- "path/to/nonexistent.pt" # dụ file không tồn tại
117
  ]
118
  print("\n--- Processing multiple files ---")
119
  results = {}
120
  for file_path in pt_files_to_check:
121
  prob_fake, label = predict_authenticity_from_pt(file_path, mirage_img, device)
122
- results[file_path] = (prob_fake, label) # Lưu kết quả vào dictionary
123
 
124
  print("\n--- Summary ---")
125
  for file, (prob_fake, label) in results.items():
 
107
  else:
108
  print(f"\nFailed to process {input_pt_path_single}.")
109
 
110
+ print("\n" + "="*50 + "\n") # Add a divider line
111
 
112
+ # --- EXAMPLE: PROCESS MULTIPLE FILES ---
113
  pt_files_to_check = [
114
+ "encodings/predictions/image/merged/my_single_image_dir/real.pt", # Replace with the real file path
115
+ # "encodings/predictions/image/merged/another_dir/fake_image.pt", # THIS LINE WAS REMOVED
116
+ "path/to/nonexistent.pt" # Example of a missing file
117
  ]
118
  print("\n--- Processing multiple files ---")
119
  results = {}
120
  for file_path in pt_files_to_check:
121
  prob_fake, label = predict_authenticity_from_pt(file_path, mirage_img, device)
122
+ results[file_path] = (prob_fake, label) # Store results in a dictionary
123
 
124
  print("\n--- Summary ---")
125
  for file, (prob_fake, label) in results.items():
miragenews/text_module/config.py CHANGED
@@ -5,9 +5,9 @@ from dotenv import load_dotenv
5
  load_dotenv()
6
 
7
  # API Keys
8
- GOOGLE_API_KEY = os.getenv("GOOGLE_CSE_CX") # Dùng cho Gemini (theo code của bạn)
9
  GOOGLE_SAFE_BROWSING_API_KEY = os.getenv("GOOGLE_SAFE_BROWSING_API_KEY")
10
- GOOGLE_CX = os.getenv("GOOGLE_CSE_CX") # Dùng cho Search
11
  SCRAPINGDOG_API_KEY = os.getenv("SCRAPINGDOG_API_KEY")
12
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
13
 
 
5
  load_dotenv()
6
 
7
  # API Keys
8
+ GOOGLE_API_KEY = os.getenv("GOOGLE_CSE_CX") # Used for Gemini (per the earlier code). NOTE(review): reads GOOGLE_CSE_CX, the same variable as GOOGLE_CX; confirm this is intended
9
  GOOGLE_SAFE_BROWSING_API_KEY = os.getenv("GOOGLE_SAFE_BROWSING_API_KEY")
10
+ GOOGLE_CX = os.getenv("GOOGLE_CSE_CX") # Used for search
11
  SCRAPINGDOG_API_KEY = os.getenv("SCRAPINGDOG_API_KEY")
12
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
13
 
miragenews/text_module/llm_utils.py CHANGED
@@ -61,7 +61,7 @@ def ask_llm_to_rewrite(text_content):
61
  max_output_tokens=MAX_TOKENS
62
  )
63
 
64
- # 4. Gọi hàm generate_content
65
  response = flash_model.generate_content(
66
  full_prompt,
67
  generation_config=config
 
61
  max_output_tokens=MAX_TOKENS
62
  )
63
 
64
+ # 4. Call generate_content
65
  response = flash_model.generate_content(
66
  full_prompt,
67
  generation_config=config