ar07xd committed on
Commit
f51c5bd
·
verified ·
1 Parent(s): b8a9970

Sync from GitHub (Code Only)

Browse files
.gitattributes CHANGED
@@ -65,3 +65,4 @@ models/icpr2020dfdc/test/data/ffpp/original_sequences/youtube/c23/videos/750.mp4
65
  media/donald-trump-gettyimages-687193180.jpg filter=lfs diff=lfs merge=lfs -text
66
  trained_models/deepfake_densenet121_high_acc.keras filter=lfs diff=lfs merge=lfs -text
67
  trained_models/deepfake_densenet121_latest.keras filter=lfs diff=lfs merge=lfs -text
 
 
65
  media/donald-trump-gettyimages-687193180.jpg filter=lfs diff=lfs merge=lfs -text
66
  trained_models/deepfake_densenet121_high_acc.keras filter=lfs diff=lfs merge=lfs -text
67
  trained_models/deepfake_densenet121_latest.keras filter=lfs diff=lfs merge=lfs -text
68
+ static/logo.png filter=lfs diff=lfs merge=lfs -text
services/report_service.py CHANGED
@@ -33,7 +33,7 @@ from db.models import AnalysisRecord, Report
33
 
34
  REPO_ROOT = Path(__file__).resolve().parents[2]
35
  BACKEND_ROOT = Path(__file__).resolve().parents[1]
36
- LOGO_PATH = REPO_ROOT / "frontend" / "src" / "assets" / "logo.png"
37
  IST = ZoneInfo("Asia/Kolkata")
38
 
39
  # Typography & Spacing Grid (base unit: 6pt)
@@ -228,6 +228,28 @@ def _image_from_base64(data: Any, max_width: float, max_height: float) -> Image
228
  return None
229
 
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  def _image_from_path(path: Path | None, max_width: float, max_height: float) -> Image | None:
232
  """Load image from path, embed as bytes in PDF, or return None with error logging."""
233
  if path is None:
@@ -582,15 +604,15 @@ def _media_context(analysis_json: dict[str, Any], record: AnalysisRecord, styles
582
  return story
583
 
584
  if media_type in {"image", "screenshot", "video"}:
585
- thumb = _image_from_path(
586
- _resolve_media_path(analysis_json.get("thumbnail_url") or record.thumbnail_url),
587
- 72 * mm,
588
- 48 * mm,
589
  )
590
- original = _image_from_path(
591
- _resolve_media_path(analysis_json.get("media_path") or record.media_path),
592
- 72 * mm,
593
- 48 * mm,
594
  )
595
  image_cell: Any = thumb or original or Paragraph("Original thumbnail unavailable", styles["small"])
596
  text_value, was_truncated = _shorten(expl.get("extracted_text") or expl.get("transcript"), 800)
@@ -804,6 +826,7 @@ def _forensic_visuals(analysis_json: dict[str, Any], styles: dict[str, Paragraph
804
  for title, caption, b64_data, url_data in candidates:
805
  img = (
806
  _image_from_base64(b64_data, 78 * mm, 58 * mm)
 
807
  or _image_from_path(_resolve_media_path(url_data), 78 * mm, 58 * mm)
808
  or _placeholder_image(78 * mm, 58 * mm)
809
  )
 
33
 
34
  REPO_ROOT = Path(__file__).resolve().parents[2]
35
  BACKEND_ROOT = Path(__file__).resolve().parents[1]
36
+ LOGO_PATH = BACKEND_ROOT / "static" / "logo.png"
37
  IST = ZoneInfo("Asia/Kolkata")
38
 
39
  # Typography & Spacing Grid (base unit: 6pt)
 
228
  return None
229
 
230
 
231
+ import urllib.request
232
+
233
+ def _image_from_url(url: str | None, max_width: float, max_height: float) -> Image | None:
234
+ """Download image from HTTP/HTTPS URL and embed in PDF."""
235
+ if not url or not str(url).startswith("http"):
236
+ return None
237
+ try:
238
+ req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
239
+ with urllib.request.urlopen(req, timeout=10) as response:
240
+ image_bytes = response.read()
241
+
242
+ with PILImage.open(BytesIO(image_bytes)) as pil:
243
+ width, height = pil.size
244
+
245
+ stream = BytesIO(image_bytes)
246
+ stream.seek(0)
247
+ return _scaled_image(stream, width, height, max_width, max_height)
248
+ except Exception as exc:
249
+ logger.warning(f"Failed to fetch image from URL {url}: {exc}")
250
+ return None
251
+
252
+
253
  def _image_from_path(path: Path | None, max_width: float, max_height: float) -> Image | None:
254
  """Load image from path, embed as bytes in PDF, or return None with error logging."""
255
  if path is None:
 
604
  return story
605
 
606
  if media_type in {"image", "screenshot", "video"}:
607
+ thumb_url = analysis_json.get("thumbnail_url") or record.thumbnail_url
608
+ thumb = (
609
+ _image_from_url(thumb_url, 72 * mm, 48 * mm)
610
+ or _image_from_path(_resolve_media_path(thumb_url), 72 * mm, 48 * mm)
611
  )
612
+ media_url = analysis_json.get("media_path") or record.media_path
613
+ original = (
614
+ _image_from_url(media_url, 72 * mm, 48 * mm)
615
+ or _image_from_path(_resolve_media_path(media_url), 72 * mm, 48 * mm)
616
  )
617
  image_cell: Any = thumb or original or Paragraph("Original thumbnail unavailable", styles["small"])
618
  text_value, was_truncated = _shorten(expl.get("extracted_text") or expl.get("transcript"), 800)
 
826
  for title, caption, b64_data, url_data in candidates:
827
  img = (
828
  _image_from_base64(b64_data, 78 * mm, 58 * mm)
829
+ or _image_from_url(url_data, 78 * mm, 58 * mm)
830
  or _image_from_path(_resolve_media_path(url_data), 78 * mm, 58 * mm)
831
  or _placeholder_image(78 * mm, 58 * mm)
832
  )
services/text_service.py CHANGED
@@ -226,6 +226,20 @@ def extract_entities(text: str, max_k: int = 6) -> List[str]:
226
  seen: set[str] = set()
227
 
228
  numeric: List[str] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  for ent in doc.ents:
230
  norm = ent.text.strip()
231
  norm_lower = norm.lower()
@@ -241,13 +255,16 @@ def extract_entities(text: str, max_k: int = 6) -> List[str]:
241
  other.append(norm)
242
 
243
  entities = preferred + numeric + other
244
- if len(entities) >= 2:
245
- logger.info(f"NER extracted {len(entities)} entities: {entities[:max_k]}")
246
- return entities[:max_k]
247
-
248
- freq_kws = _extract_keywords_freq(text, max_k)
249
- combined = entities + [k for k in freq_kws if k.lower() not in seen]
250
- return combined[:max_k]
 
 
 
251
  except Exception as e:
252
  logger.warning(f"spaCy NER failed: {e} - falling back to frequency extraction")
253
  return _extract_keywords_freq(text, max_k)
@@ -259,6 +276,9 @@ def _extract_keywords_freq(text: str, max_k: int = 6) -> List[str]:
259
  "in", "on", "at", "for", "with", "by", "from", "as", "that", "this", "it", "its", "has", "have", "had",
260
  "will", "would", "can", "could", "should", "may", "might", "do", "does", "did", "not", "no", "so",
261
  "than", "then", "there", "their", "they", "them", "we", "our", "you", "your", "he", "she", "his", "her",
 
 
 
262
  }
263
  words = re.findall(r"[A-Za-z][A-Za-z\-']{2,}|\b\d{1,5}\b", text or "")
264
  freq: dict[str, int] = {}
 
226
  seen: set[str] = set()
227
 
228
  numeric: List[str] = []
229
+
230
+ # Extract meaningful multi-word noun chunks first
231
+ for chunk in doc.noun_chunks:
232
+ parts = chunk.text.strip().split()
233
+ if len(parts) > 1 and parts[0].lower() in {"a", "an", "the", "some", "several", "many", "these", "those", "this", "that", "their", "our", "my", "your", "its"}:
234
+ parts = parts[1:]
235
+ chunk_text = " ".join(parts)
236
+ if len(parts) > 1 and len(chunk_text) > 4:
237
+ if not all(p.lower() in {"i", "you", "he", "she", "it", "we", "they", "them", "us", "him", "her"} for p in parts):
238
+ norm_lower = chunk_text.lower()
239
+ if norm_lower not in seen:
240
+ preferred.append(chunk_text)
241
+ seen.add(norm_lower)
242
+
243
  for ent in doc.ents:
244
  norm = ent.text.strip()
245
  norm_lower = norm.lower()
 
255
  other.append(norm)
256
 
257
  entities = preferred + numeric + other
258
+ if len(entities) < max_k:
259
+ freq_kws = _extract_keywords_freq(text, max_k * 2)
260
+ for k in freq_kws:
261
+ if k.lower() not in seen:
262
+ entities.append(k)
263
+ seen.add(k.lower())
264
+
265
+ result = entities[:max_k]
266
+ logger.info(f"NER extracted {len(result)} entities: {result}")
267
+ return result
268
  except Exception as e:
269
  logger.warning(f"spaCy NER failed: {e} - falling back to frequency extraction")
270
  return _extract_keywords_freq(text, max_k)
 
276
  "in", "on", "at", "for", "with", "by", "from", "as", "that", "this", "it", "its", "has", "have", "had",
277
  "will", "would", "can", "could", "should", "may", "might", "do", "does", "did", "not", "no", "so",
278
  "than", "then", "there", "their", "they", "them", "we", "our", "you", "your", "he", "she", "his", "her",
279
+ "during", "several", "also", "about", "which", "who", "whom", "what", "where", "when", "why", "how",
280
+ "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "only", "own", "same", "very",
281
+ "these", "those", "into", "through", "after", "before", "over", "under", "between", "out", "against", "during"
282
  }
283
  words = re.findall(r"[A-Za-z][A-Za-z\-']{2,}|\b\d{1,5}\b", text or "")
284
  freq: dict[str, int] = {}
static/logo.png ADDED

Git LFS Details

  • SHA256: a63a7723a2a99c03ae7a748a07898ad10a3e90ac3fcd1d6059fb29f6b7c2e9b0
  • Pointer size: 131 Bytes
  • Size of remote file: 431 kB