Spaces:
Running
Running
Sync from GitHub (Code Only)
Browse files
- .gitattributes +1 -0
- services/report_service.py +32 -9
- services/text_service.py +27 -7
- static/logo.png +3 -0
.gitattributes
CHANGED
|
@@ -65,3 +65,4 @@ models/icpr2020dfdc/test/data/ffpp/original_sequences/youtube/c23/videos/750.mp4
|
|
| 65 |
media/donald-trump-gettyimages-687193180.jpg filter=lfs diff=lfs merge=lfs -text
|
| 66 |
trained_models/deepfake_densenet121_high_acc.keras filter=lfs diff=lfs merge=lfs -text
|
| 67 |
trained_models/deepfake_densenet121_latest.keras filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 65 |
media/donald-trump-gettyimages-687193180.jpg filter=lfs diff=lfs merge=lfs -text
|
| 66 |
trained_models/deepfake_densenet121_high_acc.keras filter=lfs diff=lfs merge=lfs -text
|
| 67 |
trained_models/deepfake_densenet121_latest.keras filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
static/logo.png filter=lfs diff=lfs merge=lfs -text
|
services/report_service.py
CHANGED
|
@@ -33,7 +33,7 @@ from db.models import AnalysisRecord, Report
|
|
| 33 |
|
| 34 |
REPO_ROOT = Path(__file__).resolve().parents[2]
|
| 35 |
BACKEND_ROOT = Path(__file__).resolve().parents[1]
|
| 36 |
-
LOGO_PATH =
|
| 37 |
IST = ZoneInfo("Asia/Kolkata")
|
| 38 |
|
| 39 |
# Typography & Spacing Grid (base unit: 6pt)
|
|
@@ -228,6 +228,28 @@ def _image_from_base64(data: Any, max_width: float, max_height: float) -> Image
|
|
| 228 |
return None
|
| 229 |
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
def _image_from_path(path: Path | None, max_width: float, max_height: float) -> Image | None:
|
| 232 |
"""Load image from path, embed as bytes in PDF, or return None with error logging."""
|
| 233 |
if path is None:
|
|
@@ -582,15 +604,15 @@ def _media_context(analysis_json: dict[str, Any], record: AnalysisRecord, styles
|
|
| 582 |
return story
|
| 583 |
|
| 584 |
if media_type in {"image", "screenshot", "video"}:
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
72 * mm,
|
| 588 |
-
|
| 589 |
)
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
72 * mm,
|
| 593 |
-
|
| 594 |
)
|
| 595 |
image_cell: Any = thumb or original or Paragraph("Original thumbnail unavailable", styles["small"])
|
| 596 |
text_value, was_truncated = _shorten(expl.get("extracted_text") or expl.get("transcript"), 800)
|
|
@@ -804,6 +826,7 @@ def _forensic_visuals(analysis_json: dict[str, Any], styles: dict[str, Paragraph
|
|
| 804 |
for title, caption, b64_data, url_data in candidates:
|
| 805 |
img = (
|
| 806 |
_image_from_base64(b64_data, 78 * mm, 58 * mm)
|
|
|
|
| 807 |
or _image_from_path(_resolve_media_path(url_data), 78 * mm, 58 * mm)
|
| 808 |
or _placeholder_image(78 * mm, 58 * mm)
|
| 809 |
)
|
|
|
|
| 33 |
|
| 34 |
REPO_ROOT = Path(__file__).resolve().parents[2]
|
| 35 |
BACKEND_ROOT = Path(__file__).resolve().parents[1]
|
| 36 |
+
LOGO_PATH = BACKEND_ROOT / "static" / "logo.png"
|
| 37 |
IST = ZoneInfo("Asia/Kolkata")
|
| 38 |
|
| 39 |
# Typography & Spacing Grid (base unit: 6pt)
|
|
|
|
| 228 |
return None
|
| 229 |
|
| 230 |
|
| 231 |
+
import urllib.request
|
| 232 |
+
|
| 233 |
+
def _image_from_url(
    url: str | None,
    max_width: float,
    max_height: float,
    *,
    max_bytes: int = 25 * 1024 * 1024,
    timeout: float = 10,
) -> Image | None:
    """Download an image over HTTP(S) and hand it to ``_scaled_image`` for the PDF.

    Args:
        url: Candidate image location. Anything falsy, or whose scheme is not
            ``http``/``https``, is rejected without touching the network so the
            caller can fall through to its next image source.
        max_width: Maximum width forwarded to ``_scaled_image``.
        max_height: Maximum height forwarded to ``_scaled_image``.
        max_bytes: Upper bound on the number of bytes read from the response.
            Larger payloads are skipped (the same helper is called with the
            media path of videos, which must not be buffered whole in memory).
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        Whatever ``_scaled_image`` returns for the downloaded bytes, or ``None``
        when the URL is unusable, the payload is too large, or any step of the
        download/decoding fails (failures are logged as warnings, never raised).
    """
    from urllib.parse import urlsplit

    if not url:
        return None
    url = str(url)

    # NOTE(review): the URL originates from analysis data; if that data can be
    # influenced by end users this is an SSRF vector (internal hosts, metadata
    # endpoints). Consider an allow-list of hosts before fetching.
    try:
        # urlsplit lower-cases the scheme, so "HTTP://..." is accepted while
        # look-alikes such as "httpdocs/img.png" are rejected up front instead
        # of failing inside urlopen and producing a misleading warning.
        if urlsplit(url).scheme not in {"http", "https"}:
            return None

        request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            # Read one byte past the limit so an oversized body is detectable
            # without ever buffering more than max_bytes + 1 bytes.
            image_bytes = response.read(max_bytes + 1)

        if len(image_bytes) > max_bytes:
            logger.warning(
                "Skipping image from URL %s: payload exceeds %d bytes", url, max_bytes
            )
            return None

        # Open once only to learn the pixel dimensions; the PDF embeds the raw
        # bytes through a fresh stream positioned at offset 0.
        with PILImage.open(BytesIO(image_bytes)) as pil:
            width, height = pil.size

        return _scaled_image(BytesIO(image_bytes), width, height, max_width, max_height)
    except Exception as exc:
        # Best-effort by design: report generation must continue without the image.
        logger.warning("Failed to fetch image from URL %s: %s", url, exc)
        return None
|
| 251 |
+
|
| 252 |
+
|
| 253 |
def _image_from_path(path: Path | None, max_width: float, max_height: float) -> Image | None:
|
| 254 |
"""Load image from path, embed as bytes in PDF, or return None with error logging."""
|
| 255 |
if path is None:
|
|
|
|
| 604 |
return story
|
| 605 |
|
| 606 |
if media_type in {"image", "screenshot", "video"}:
|
| 607 |
+
thumb_url = analysis_json.get("thumbnail_url") or record.thumbnail_url
|
| 608 |
+
thumb = (
|
| 609 |
+
_image_from_url(thumb_url, 72 * mm, 48 * mm)
|
| 610 |
+
or _image_from_path(_resolve_media_path(thumb_url), 72 * mm, 48 * mm)
|
| 611 |
)
|
| 612 |
+
media_url = analysis_json.get("media_path") or record.media_path
|
| 613 |
+
original = (
|
| 614 |
+
_image_from_url(media_url, 72 * mm, 48 * mm)
|
| 615 |
+
or _image_from_path(_resolve_media_path(media_url), 72 * mm, 48 * mm)
|
| 616 |
)
|
| 617 |
image_cell: Any = thumb or original or Paragraph("Original thumbnail unavailable", styles["small"])
|
| 618 |
text_value, was_truncated = _shorten(expl.get("extracted_text") or expl.get("transcript"), 800)
|
|
|
|
| 826 |
for title, caption, b64_data, url_data in candidates:
|
| 827 |
img = (
|
| 828 |
_image_from_base64(b64_data, 78 * mm, 58 * mm)
|
| 829 |
+
or _image_from_url(url_data, 78 * mm, 58 * mm)
|
| 830 |
or _image_from_path(_resolve_media_path(url_data), 78 * mm, 58 * mm)
|
| 831 |
or _placeholder_image(78 * mm, 58 * mm)
|
| 832 |
)
|
services/text_service.py
CHANGED
|
@@ -226,6 +226,20 @@ def extract_entities(text: str, max_k: int = 6) -> List[str]:
|
|
| 226 |
seen: set[str] = set()
|
| 227 |
|
| 228 |
numeric: List[str] = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
for ent in doc.ents:
|
| 230 |
norm = ent.text.strip()
|
| 231 |
norm_lower = norm.lower()
|
|
@@ -241,13 +255,16 @@ def extract_entities(text: str, max_k: int = 6) -> List[str]:
|
|
| 241 |
other.append(norm)
|
| 242 |
|
| 243 |
entities = preferred + numeric + other
|
| 244 |
-
if len(entities)
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
| 251 |
except Exception as e:
|
| 252 |
logger.warning(f"spaCy NER failed: {e} - falling back to frequency extraction")
|
| 253 |
return _extract_keywords_freq(text, max_k)
|
|
@@ -259,6 +276,9 @@ def _extract_keywords_freq(text: str, max_k: int = 6) -> List[str]:
|
|
| 259 |
"in", "on", "at", "for", "with", "by", "from", "as", "that", "this", "it", "its", "has", "have", "had",
|
| 260 |
"will", "would", "can", "could", "should", "may", "might", "do", "does", "did", "not", "no", "so",
|
| 261 |
"than", "then", "there", "their", "they", "them", "we", "our", "you", "your", "he", "she", "his", "her",
|
|
|
|
|
|
|
|
|
|
| 262 |
}
|
| 263 |
words = re.findall(r"[A-Za-z][A-Za-z\-']{2,}|\b\d{1,5}\b", text or "")
|
| 264 |
freq: dict[str, int] = {}
|
|
|
|
| 226 |
seen: set[str] = set()
|
| 227 |
|
| 228 |
numeric: List[str] = []
|
| 229 |
+
|
| 230 |
+
# Extract meaningful multi-word noun chunks first
|
| 231 |
+
for chunk in doc.noun_chunks:
|
| 232 |
+
parts = chunk.text.strip().split()
|
| 233 |
+
if len(parts) > 1 and parts[0].lower() in {"a", "an", "the", "some", "several", "many", "these", "those", "this", "that", "their", "our", "my", "your", "its"}:
|
| 234 |
+
parts = parts[1:]
|
| 235 |
+
chunk_text = " ".join(parts)
|
| 236 |
+
if len(parts) > 1 and len(chunk_text) > 4:
|
| 237 |
+
if not all(p.lower() in {"i", "you", "he", "she", "it", "we", "they", "them", "us", "him", "her"} for p in parts):
|
| 238 |
+
norm_lower = chunk_text.lower()
|
| 239 |
+
if norm_lower not in seen:
|
| 240 |
+
preferred.append(chunk_text)
|
| 241 |
+
seen.add(norm_lower)
|
| 242 |
+
|
| 243 |
for ent in doc.ents:
|
| 244 |
norm = ent.text.strip()
|
| 245 |
norm_lower = norm.lower()
|
|
|
|
| 255 |
other.append(norm)
|
| 256 |
|
| 257 |
entities = preferred + numeric + other
|
| 258 |
+
if len(entities) < max_k:
|
| 259 |
+
freq_kws = _extract_keywords_freq(text, max_k * 2)
|
| 260 |
+
for k in freq_kws:
|
| 261 |
+
if k.lower() not in seen:
|
| 262 |
+
entities.append(k)
|
| 263 |
+
seen.add(k.lower())
|
| 264 |
+
|
| 265 |
+
result = entities[:max_k]
|
| 266 |
+
logger.info(f"NER extracted {len(result)} entities: {result}")
|
| 267 |
+
return result
|
| 268 |
except Exception as e:
|
| 269 |
logger.warning(f"spaCy NER failed: {e} - falling back to frequency extraction")
|
| 270 |
return _extract_keywords_freq(text, max_k)
|
|
|
|
| 276 |
"in", "on", "at", "for", "with", "by", "from", "as", "that", "this", "it", "its", "has", "have", "had",
|
| 277 |
"will", "would", "can", "could", "should", "may", "might", "do", "does", "did", "not", "no", "so",
|
| 278 |
"than", "then", "there", "their", "they", "them", "we", "our", "you", "your", "he", "she", "his", "her",
|
| 279 |
+
"during", "several", "also", "about", "which", "who", "whom", "what", "where", "when", "why", "how",
|
| 280 |
+
"all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "only", "own", "same", "very",
|
| 281 |
+
"these", "those", "into", "through", "after", "before", "over", "under", "between", "out", "against", "during"
|
| 282 |
}
|
| 283 |
words = re.findall(r"[A-Za-z][A-Za-z\-']{2,}|\b\d{1,5}\b", text or "")
|
| 284 |
freq: dict[str, int] = {}
|
static/logo.png
ADDED
|
Git LFS Details
|