ziadsameh32 committed on
Commit
4c47447
·
1 Parent(s): eec4f12

Initial FastAPI CrewAI setup

Browse files
Files changed (3) hide show
  1. rag/ingest_net.py +2 -1
  2. rag/pdf_text.py +14 -9
  3. requirements.txt +1 -0
rag/ingest_net.py CHANGED
@@ -49,11 +49,12 @@ def ingest_from_net(user_id: str, book_id: str, sources: List[Dict[str, Any]]):
49
  # ---------- Extract text / OCR ----------
50
  from .pdf_text import extract_text_pypdf2, is_text_usable
51
  from .ocr import mistral_ocr_pdf
52
-
53
  pages = extract_text_pypdf2(pdf_bytes)
54
 
55
  joined = "\n".join(pages)
56
  if is_text_usable(joined):
 
57
  extraction_method = "text"
58
  else:
59
  if len(pdf_bytes) > 10 * 1024 * 1024:
 
49
  # ---------- Extract text / OCR ----------
50
  from .pdf_text import extract_text_pypdf2, is_text_usable
51
  from .ocr import mistral_ocr_pdf
52
+
53
  pages = extract_text_pypdf2(pdf_bytes)
54
 
55
  joined = "\n".join(pages)
56
  if is_text_usable(joined):
57
+ print(f"Done pypdf2✅ | pages={len(pages)}")
58
  extraction_method = "text"
59
  else:
60
  if len(pdf_bytes) > 10 * 1024 * 1024:
rag/pdf_text.py CHANGED
@@ -3,16 +3,24 @@ from io import BytesIO
3
  import re
4
 
5
  from PyPDF2 import PdfReader
 
6
 
7
 
8
  def extract_text_pypdf2(pdf_bytes: bytes) -> List[str]:
9
- """
10
- Extract text per page using PyPDF2.
11
- Returns: List[str] where each item = one page text
12
- """
13
  pages: List[str] = []
14
 
15
- reader = PdfReader(BytesIO(pdf_bytes))
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  for page in reader.pages:
18
  try:
@@ -20,13 +28,10 @@ def extract_text_pypdf2(pdf_bytes: bytes) -> List[str]:
20
  except Exception:
21
  text = ""
22
 
23
- # Normalize whitespace (مهم للتشانكينج)
24
  text = re.sub(r"\s+\n", "\n", text)
25
  text = re.sub(r"\n\s+", "\n", text)
26
  text = re.sub(r"[ \t]+", " ", text)
27
- text = text.strip()
28
-
29
- pages.append(text)
30
 
31
  return pages
32
 
 
3
  import re
4
 
5
  from PyPDF2 import PdfReader
6
+ from PyPDF2.errors import DependencyError
7
 
8
 
9
  def extract_text_pypdf2(pdf_bytes: bytes) -> List[str]:
 
 
 
 
10
  pages: List[str] = []
11
 
12
+ try:
13
+ reader = PdfReader(BytesIO(pdf_bytes))
14
+ except DependencyError:
15
+ # AES encryption without pycryptodome
16
+ return []
17
+
18
+ # If the PDF is encrypted
19
+ if reader.is_encrypted:
20
+ try:
21
+ reader.decrypt("") # try an empty password
22
+ except Exception:
23
+ return []
24
 
25
  for page in reader.pages:
26
  try:
 
28
  except Exception:
29
  text = ""
30
 
 
31
  text = re.sub(r"\s+\n", "\n", text)
32
  text = re.sub(r"\n\s+", "\n", text)
33
  text = re.sub(r"[ \t]+", " ", text)
34
+ pages.append(text.strip())
 
 
35
 
36
  return pages
37
 
requirements.txt CHANGED
@@ -25,3 +25,4 @@ rapidfuzz
25
  supabase
26
  crawlee==0.3.6
27
  browserforge==1.1.2
 
 
25
  supabase
26
  crawlee==0.3.6
27
  browserforge==1.1.2
28
+ pycryptodome