Spaces:

ziadsameh32
/

ContiAI-v4

Sleeping

ziadsameh32 committed on Feb 5

Commit

4c47447

1 Parent(s): eec4f12

Initial FastAPI CrewAI setup

Files changed (3) hide show

rag/ingest_net.py CHANGED Viewed

@@ -49,11 +49,12 @@ def ingest_from_net(user_id: str, book_id: str, sources: List[Dict[str, Any]]):
         # ---------- Extract text / OCR ----------
         from .pdf_text import extract_text_pypdf2, is_text_usable
         from .ocr import mistral_ocr_pdf
         pages = extract_text_pypdf2(pdf_bytes)
         joined = "\n".join(pages)
         if is_text_usable(joined):
             extraction_method = "text"
         else:
             if len(pdf_bytes) > 10 * 1024 * 1024:

         # ---------- Extract text / OCR ----------
         from .pdf_text import extract_text_pypdf2, is_text_usable
         from .ocr import mistral_ocr_pdf
         pages = extract_text_pypdf2(pdf_bytes)
         joined = "\n".join(pages)
         if is_text_usable(joined):
+            print(f"Done pypdf2✅ | pages={len(pages)}")
             extraction_method = "text"
         else:
             if len(pdf_bytes) > 10 * 1024 * 1024:

rag/pdf_text.py CHANGED Viewed

@@ -3,16 +3,24 @@ from io import BytesIO
 import re
 from PyPDF2 import PdfReader
 def extract_text_pypdf2(pdf_bytes: bytes) -> List[str]:
-    """
-    Extract text per page using PyPDF2.
-    Returns: List[str] where each item = one page text
-    """
     pages: List[str] = []
-    reader = PdfReader(BytesIO(pdf_bytes))
     for page in reader.pages:
         try:
@@ -20,13 +28,10 @@ def extract_text_pypdf2(pdf_bytes: bytes) -> List[str]:
         except Exception:
             text = ""
-        # Normalize whitespace (مهم للتشانكينج)
         text = re.sub(r"\s+\n", "\n", text)
         text = re.sub(r"\n\s+", "\n", text)
         text = re.sub(r"[ \t]+", " ", text)
-        text = text.strip()
-        pages.append(text)
     return pages

 import re
 from PyPDF2 import PdfReader
+from PyPDF2.errors import DependencyError
 def extract_text_pypdf2(pdf_bytes: bytes) -> List[str]:
     pages: List[str] = []
+    try:
+        reader = PdfReader(BytesIO(pdf_bytes))
+    except DependencyError:
+        # AES encryption without pycryptodome
+        return []
+    # لو PDF مشفر
+    if reader.is_encrypted:
+        try:
+            reader.decrypt("")  # حاول password فاضي
+        except Exception:
+            return []
     for page in reader.pages:
         try:
         except Exception:
             text = ""
         text = re.sub(r"\s+\n", "\n", text)
         text = re.sub(r"\n\s+", "\n", text)
         text = re.sub(r"[ \t]+", " ", text)
+        pages.append(text.strip())
     return pages

requirements.txt CHANGED Viewed

@@ -25,3 +25,4 @@ rapidfuzz
 supabase
 crawlee==0.3.6
 browserforge==1.1.2

 supabase
 crawlee==0.3.6
 browserforge==1.1.2
+pycryptodome