Spaces:

ar07xd
/

deepshield

Running

App Files Community

ar07xd committed on 14 days ago

Commit

f51c5bd

verified ·

1 Parent(s): b8a9970

Sync from GitHub (Code Only)

Browse files

Files changed (4) hide show

.gitattributes +1 -0
services/report_service.py +32 -9
services/text_service.py +27 -7
static/logo.png +3 -0

.gitattributes CHANGED Viewed

@@ -65,3 +65,4 @@ models/icpr2020dfdc/test/data/ffpp/original_sequences/youtube/c23/videos/750.mp4
 media/donald-trump-gettyimages-687193180.jpg filter=lfs diff=lfs merge=lfs -text
 trained_models/deepfake_densenet121_high_acc.keras filter=lfs diff=lfs merge=lfs -text
 trained_models/deepfake_densenet121_latest.keras filter=lfs diff=lfs merge=lfs -text

 media/donald-trump-gettyimages-687193180.jpg filter=lfs diff=lfs merge=lfs -text
 trained_models/deepfake_densenet121_high_acc.keras filter=lfs diff=lfs merge=lfs -text
 trained_models/deepfake_densenet121_latest.keras filter=lfs diff=lfs merge=lfs -text
+static/logo.png filter=lfs diff=lfs merge=lfs -text

services/report_service.py CHANGED Viewed

@@ -33,7 +33,7 @@ from db.models import AnalysisRecord, Report
 REPO_ROOT = Path(__file__).resolve().parents[2]
 BACKEND_ROOT = Path(__file__).resolve().parents[1]
-LOGO_PATH = REPO_ROOT / "frontend" / "src" / "assets" / "logo.png"
 IST = ZoneInfo("Asia/Kolkata")
 # Typography & Spacing Grid (base unit: 6pt)
@@ -228,6 +228,28 @@ def _image_from_base64(data: Any, max_width: float, max_height: float) -> Image
         return None
 def _image_from_path(path: Path | None, max_width: float, max_height: float) -> Image | None:
     """Load image from path, embed as bytes in PDF, or return None with error logging."""
     if path is None:
@@ -582,15 +604,15 @@ def _media_context(analysis_json: dict[str, Any], record: AnalysisRecord, styles
         return story
     if media_type in {"image", "screenshot", "video"}:
-        thumb = _image_from_path(
-            _resolve_media_path(analysis_json.get("thumbnail_url") or record.thumbnail_url),
-            72 * mm,
-            48 * mm,
         )
-        original = _image_from_path(
-            _resolve_media_path(analysis_json.get("media_path") or record.media_path),
-            72 * mm,
-            48 * mm,
         )
         image_cell: Any = thumb or original or Paragraph("Original thumbnail unavailable", styles["small"])
         text_value, was_truncated = _shorten(expl.get("extracted_text") or expl.get("transcript"), 800)
@@ -804,6 +826,7 @@ def _forensic_visuals(analysis_json: dict[str, Any], styles: dict[str, Paragraph
     for title, caption, b64_data, url_data in candidates:
         img = (
             _image_from_base64(b64_data, 78 * mm, 58 * mm)
             or _image_from_path(_resolve_media_path(url_data), 78 * mm, 58 * mm)
             or _placeholder_image(78 * mm, 58 * mm)
         )

 REPO_ROOT = Path(__file__).resolve().parents[2]
 BACKEND_ROOT = Path(__file__).resolve().parents[1]
+LOGO_PATH = BACKEND_ROOT / "static" / "logo.png"
 IST = ZoneInfo("Asia/Kolkata")
 # Typography & Spacing Grid (base unit: 6pt)
         return None
+import urllib.request
+def _image_from_url(url: str | None, max_width: float, max_height: float) -> Image | None:
+    """Download image from HTTP/HTTPS URL and embed in PDF."""
+    if not url or not str(url).startswith("http"):
+        return None
+    try:
+        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
+        with urllib.request.urlopen(req, timeout=10) as response:
+            image_bytes = response.read()
+        with PILImage.open(BytesIO(image_bytes)) as pil:
+            width, height = pil.size
+        stream = BytesIO(image_bytes)
+        stream.seek(0)
+        return _scaled_image(stream, width, height, max_width, max_height)
+    except Exception as exc:
+        logger.warning(f"Failed to fetch image from URL {url}: {exc}")
+        return None
 def _image_from_path(path: Path | None, max_width: float, max_height: float) -> Image | None:
     """Load image from path, embed as bytes in PDF, or return None with error logging."""
     if path is None:
         return story
     if media_type in {"image", "screenshot", "video"}:
+        thumb_url = analysis_json.get("thumbnail_url") or record.thumbnail_url
+        thumb = (
+            _image_from_url(thumb_url, 72 * mm, 48 * mm)
+            or _image_from_path(_resolve_media_path(thumb_url), 72 * mm, 48 * mm)
         )
+        media_url = analysis_json.get("media_path") or record.media_path
+        original = (
+            _image_from_url(media_url, 72 * mm, 48 * mm)
+            or _image_from_path(_resolve_media_path(media_url), 72 * mm, 48 * mm)
         )
         image_cell: Any = thumb or original or Paragraph("Original thumbnail unavailable", styles["small"])
         text_value, was_truncated = _shorten(expl.get("extracted_text") or expl.get("transcript"), 800)
     for title, caption, b64_data, url_data in candidates:
         img = (
             _image_from_base64(b64_data, 78 * mm, 58 * mm)
+            or _image_from_url(url_data, 78 * mm, 58 * mm)
             or _image_from_path(_resolve_media_path(url_data), 78 * mm, 58 * mm)
             or _placeholder_image(78 * mm, 58 * mm)
         )

services/text_service.py CHANGED Viewed

@@ -226,6 +226,20 @@ def extract_entities(text: str, max_k: int = 6) -> List[str]:
         seen: set[str] = set()
         numeric: List[str] = []
         for ent in doc.ents:
             norm = ent.text.strip()
             norm_lower = norm.lower()
@@ -241,13 +255,16 @@ def extract_entities(text: str, max_k: int = 6) -> List[str]:
                 other.append(norm)
         entities = preferred + numeric + other
-        if len(entities) >= 2:
-            logger.info(f"NER extracted {len(entities)} entities: {entities[:max_k]}")
-            return entities[:max_k]
-        freq_kws = _extract_keywords_freq(text, max_k)
-        combined = entities + [k for k in freq_kws if k.lower() not in seen]
-        return combined[:max_k]
     except Exception as e:
         logger.warning(f"spaCy NER failed: {e} - falling back to frequency extraction")
         return _extract_keywords_freq(text, max_k)
@@ -259,6 +276,9 @@ def _extract_keywords_freq(text: str, max_k: int = 6) -> List[str]:
         "in", "on", "at", "for", "with", "by", "from", "as", "that", "this", "it", "its", "has", "have", "had",
         "will", "would", "can", "could", "should", "may", "might", "do", "does", "did", "not", "no", "so",
         "than", "then", "there", "their", "they", "them", "we", "our", "you", "your", "he", "she", "his", "her",
     }
     words = re.findall(r"[A-Za-z][A-Za-z\-']{2,}|\b\d{1,5}\b", text or "")
     freq: dict[str, int] = {}

         seen: set[str] = set()
         numeric: List[str] = []
+        # Extract meaningful multi-word noun chunks first
+        for chunk in doc.noun_chunks:
+            parts = chunk.text.strip().split()
+            if len(parts) > 1 and parts[0].lower() in {"a", "an", "the", "some", "several", "many", "these", "those", "this", "that", "their", "our", "my", "your", "its"}:
+                parts = parts[1:]
+            chunk_text = " ".join(parts)
+            if len(parts) > 1 and len(chunk_text) > 4:
+                if not all(p.lower() in {"i", "you", "he", "she", "it", "we", "they", "them", "us", "him", "her"} for p in parts):
+                    norm_lower = chunk_text.lower()
+                    if norm_lower not in seen:
+                        preferred.append(chunk_text)
+                        seen.add(norm_lower)
         for ent in doc.ents:
             norm = ent.text.strip()
             norm_lower = norm.lower()
                 other.append(norm)
         entities = preferred + numeric + other
+        if len(entities) < max_k:
+            freq_kws = _extract_keywords_freq(text, max_k * 2)
+            for k in freq_kws:
+                if k.lower() not in seen:
+                    entities.append(k)
+                    seen.add(k.lower())
+        result = entities[:max_k]
+        logger.info(f"NER extracted {len(result)} entities: {result}")
+        return result
     except Exception as e:
         logger.warning(f"spaCy NER failed: {e} - falling back to frequency extraction")
         return _extract_keywords_freq(text, max_k)
         "in", "on", "at", "for", "with", "by", "from", "as", "that", "this", "it", "its", "has", "have", "had",
         "will", "would", "can", "could", "should", "may", "might", "do", "does", "did", "not", "no", "so",
         "than", "then", "there", "their", "they", "them", "we", "our", "you", "your", "he", "she", "his", "her",
+        "during", "several", "also", "about", "which", "who", "whom", "what", "where", "when", "why", "how",
+        "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "only", "own", "same", "very",
+        "these", "those", "into", "through", "after", "before", "over", "under", "between", "out", "against", "during"
     }
     words = re.findall(r"[A-Za-z][A-Za-z\-']{2,}|\b\d{1,5}\b", text or "")
     freq: dict[str, int] = {}

static/logo.png ADDED Viewed

Git LFS Details

SHA256: a63a7723a2a99c03ae7a748a07898ad10a3e90ac3fcd1d6059fb29f6b7c2e9b0
Pointer size: 131 Bytes
Size of remote file: 431 kB