Commit b8261f9 · Parent(s): 7654fed
Add login page

Files changed:
- rag/chuncking.py +55 -25
- rag/ingest_net.py +141 -98
- rag/rag_engine_sources.py +58 -62

rag/chuncking.py
CHANGED
@@ -1,56 +1,86 @@

Before (removed lines; "…" marks text lost in extraction). This version
flushed fixed-size chunks first and then applied overlap as a character-level
post-pass (prev_tail = text[-overlap_tokens * 4 :]), so overlapped text could
spill outside a chunk's recorded page range:

    import re
    from typing import List, Tuple


    def approx_token_count(text: str) -> int:
        """
        Rough token …
        """
        return max(1, len(text) // 4)


    def chunk_pages(
        pages: List[str],
        …
    ) -> List[Tuple[str, int, int]]:
        """
        Split pages into overlapping chunks.
        Returns: …
        """
        chunks = []
        …
        buffer_tokens = 0

        def …
            nonlocal buffer, buffer_pages, buffer_tokens
            if buffer:
                …

        for page_idx, page in enumerate(pages, start=1):
            paragraphs = [p.strip() for p in re.split(r"\n\s*\n", page) if p.strip()]

            for para in paragraphs:
                t = approx_token_count(para)
                if buffer_tokens + t > target_tokens:
                    …

                buffer.append(para)
                buffer_pages.append(page_idx)
                buffer_tokens += t

        flush…
        …
        if overlap_tokens > 0 and len(chunks) > 1:
            overlapped = []
            prev_tail = ""

            for text, ps, pe in chunks:
                merged = (prev_tail + "\n" + text).strip()
                overlapped.append((merged, ps, pe))
                prev_tail = text[-overlap_tokens * 4 :]

            return overlapped

        return chunks
After (the rewritten chunker; overlap is applied at paragraph level while
accumulating, so page ranges stay exact):

    # chunking.py
    import re
    from typing import List, Tuple


    def approx_token_count(text: str) -> int:
        """
        Rough token estimate for chunk sizing (heuristic).
        Keep this simple but consistent: ~4 chars per token.
        """
        return max(1, len(text) // 4)


    def chunk_pages(
        pages: List[str],
        target_tokens: int = 520,
        overlap_tokens: int = 80,
    ) -> List[Tuple[str, int, int]]:
        """
        Split pages (list[str]) into overlapping chunks.
        Returns list of tuples: (chunk_text, page_start, page_end)

        Overlap is implemented at paragraph level (keeps page ranges correct).
        """
        chunks: List[Tuple[str, int, int]] = []

        buffer: List[str] = []
        buffer_pages: List[int] = []
        buffer_tokens = 0

        def make_chunk():
            nonlocal buffer, buffer_pages, buffer_tokens
            if not buffer:
                return
            chunk_text = "\n\n".join(buffer).strip()
            page_start = min(buffer_pages)
            page_end = max(buffer_pages)
            chunks.append((chunk_text, page_start, page_end))

        for page_idx, page in enumerate(pages, start=1):
            paragraphs = [p.strip() for p in re.split(r"\n\s*\n", page) if p.strip()]

            for para in paragraphs:
                t = approx_token_count(para)

                # If a single paragraph exceeds target, emit it as its own chunk
                if buffer_tokens == 0 and t > target_tokens:
                    buffer = [para]
                    buffer_pages = [page_idx]
                    buffer_tokens = t
                    make_chunk()
                    buffer, buffer_pages, buffer_tokens = [], [], 0
                    continue

                # If adding this paragraph would exceed target, flush the current chunk
                if buffer_tokens + t > target_tokens:
                    make_chunk()

                    # prepare overlap: keep tail paragraphs whose tokens sum >= overlap_tokens
                    tail_buffer: List[str] = []
                    tail_pages: List[int] = []
                    tail_tokens = 0
                    # iterate the buffer in reverse to pick tail paragraphs
                    for p, p_pg in zip(reversed(buffer), reversed(buffer_pages)):
                        pt = approx_token_count(p)
                        tail_buffer.insert(0, p)
                        tail_pages.insert(0, p_pg)
                        tail_tokens += pt
                        if tail_tokens >= overlap_tokens:
                            break

                    buffer = tail_buffer
                    buffer_pages = tail_pages
                    buffer_tokens = tail_tokens

                # append the current paragraph
                buffer.append(para)
                buffer_pages.append(page_idx)
                buffer_tokens += t

        # final flush
        if buffer:
            make_chunk()

        return chunks
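A minimal usage sketch of the new chunker (toy pages and deliberately tiny
token limits, made up for illustration; assumes the repo root is importable
and the module keeps its current chuncking.py filename):

    from rag.chuncking import chunk_pages

    pages = [
        "Intro paragraph about the book.\n\nSecond paragraph with more detail.",
        "A paragraph that starts page two.\n\nAnd a closing paragraph.",
    ]

    # tiny limits so the toy input actually splits; the real defaults are 520/80
    for text, page_start, page_end in chunk_pages(pages, target_tokens=20, overlap_tokens=5):
        print(f"pages {page_start}-{page_end}: {text[:40]!r}")

Because overlap re-uses whole tail paragraphs together with the pages they
came from, each printed page range stays exact even for overlapped chunks.
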
rag/ingest_net.py
CHANGED
@@ -1,7 +1,8 @@
@@ -12,9 +13,8 @@ from core.books.storage import (
@@ -26,115 +26,158 @@ HEAD_PAGES_N = int(os.getenv("HEAD_PAGES_N", "5"))

Before ("…" marks text lost in extraction). The removed version created the
RAG engine up front, then ran fetch, extraction, preprocessing, the raw-DB
insert, and embedding all inside a single worker() on the thread pool:

    # rag/ingest_net.py
    import uuid
    import os
    …
    from typing import List, Dict, Any, Tuple

    from core.books.fetch_url import get_pdf_bytes
    from core.books.storage import (
        …
        mark_raw_status,
    )
    from schemas.books.sources_schema import DocRaw
    from .rag_engine_sources import ArabicBookRAGWithSources

    MAX_INGEST_WORKERS = int(os.getenv("MAX_INGEST_WORKERS", "…
    MIN_PDF_SIZE_KB = int(os.getenv("MIN_PDF_SIZE_KB", "60"))

    EMBEDDING_MODEL = os.getenv(
    …
    HEAD_PAGES_N = int(os.getenv("HEAD_PAGES_N", "5"))


    def ingest_from_net(user_id: str, book_id: str, sources: List[Dict[str, Any]]):
        init_db()
        if not check_db_health():
            raise RuntimeError("Supabase / DB is not reachable")

        rag = ArabicBookRAGWithSources(
            user_id=user_id, book_id=book_id, embedding_model=EMBEDDING_MODEL
        )

        def worker(src):
            …
            doc_id = str(uuid.uuid4())
            …
                print(f"Done pypdf2✅ | pages={len(pages)}")
                extraction_method = "text"
            else:
                if len(pdf_bytes) > 10 * 1024 * 1024:
                    return {
                        "url": url,
                        "status": "rejected",
                        "reason": "pdf_too_large_for_ocr",
                    }
                try:
                    pages = mistral_ocr_pdf(pdf_bytes)  # assumed to return a list[str] of pages
                    extraction_method = "ocr"
                except Exception:
                    return {"url": url, "status": "rejected", "reason": "ocr_failed"}

            if not pages:
                return {"url": url, "status": "rejected", "reason": "no_text_extracted"}

            # ---------- Preprocess for Arabic (optional) ----------
            from .preprocess import normalize_arabic, drop_common_headers_footers

            language = src.get("language", "ar")
            if language == "ar":
                pages = [normalize_arabic(p) for p in pages]
                pages = drop_common_headers_footers(pages)

            pages_head = pages[:HEAD_PAGES_N]

            # ---------- Store RAW in Supabase (Phase 1) ----------
            raw_doc = DocRaw(
                doc_id=doc_id,
                user_id=user_id,
                book_id=book_id,
                source_url=url,
                source_type=src.get("source_type", "pdf"),
                domain=src.get("domain", ""),
                language=language,
                pages_head=pages_head,
                extraction_method=extraction_method,
                pdf_size_bytes=len(pdf_bytes),
                status="pending",
                error_reason="",
            )
            …
                return {
                    "url": url,
                    "status": "rejected",
                    "reason": f"db_insert_failed: {str(e)}",
                }

            try:
                stats = rag.ingest_pages(pages=pages, raw_doc=raw_doc)
            except Exception as e:
                …
                "extraction": extraction_method,
                "stats": stats,
            }

        items = []
        with ThreadPoolExecutor(max_workers=MAX_INGEST_WORKERS) as ex:
            for result in ex.map(worker, sources):
                items.append(result)

        ingested = sum(1 for x in items if x["status"] == "ingested")
        rejected = len(items) - ingested

        return …
After (the rewrite splits ingestion into a concurrent prepare phase and a
sequential embed/upsert phase):

    # rag/ingest_net.py
    import uuid
    import os
    import time
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from typing import List, Dict, Any, Tuple

    from core.books.fetch_url import get_pdf_bytes
    from core.books.storage import (
        …
        mark_raw_status,
    )
    from schemas.books.sources_schema import DocRaw

    MAX_INGEST_WORKERS = int(os.getenv("MAX_INGEST_WORKERS", "6"))
    MIN_PDF_SIZE_KB = int(os.getenv("MIN_PDF_SIZE_KB", "60"))

    EMBEDDING_MODEL = os.getenv(
    …
    HEAD_PAGES_N = int(os.getenv("HEAD_PAGES_N", "5"))


    def ingest_from_net(user_id: str, book_id: str, sources: List[Dict[str, Any]]):
        """
        Two-phase ingest:
          Phase A (concurrent): fetch PDFs, extract text/OCR, preprocess, insert raw doc to DB.
          Phase B (sequential): load the embedder once and ingest pages (encode + upsert) to Qdrant.
        This avoids loading the embedding model many times (a huge slow-down).
        """
        init_db()
        if not check_db_health():
            raise RuntimeError("Supabase / DB is not reachable")

        collection_name = f"user_{user_id}__book_{book_id}"

        # Worker for phase A: fetch + extract + preprocess + insert_raw_document
        def fetch_and_prepare(src: Dict[str, Any]) -> Dict[str, Any]:
            url = src.get("url")
            doc_id = str(uuid.uuid4())
            start = time.time()
            result = {"url": url, "doc_id": doc_id, "status": "rejected", "reason": None}

            try:
                pdf_bytes = get_pdf_bytes(url)
                if not pdf_bytes:
                    result["reason"] = "no_pdf_or_blocked"
                    return result

                if len(pdf_bytes) < MIN_PDF_SIZE_KB * 1024:
                    result["reason"] = "pdf_too_small"
                    return result

                # delayed imports for optional OCR libs
                from .pdf_text import extract_text_pypdf2, is_text_usable
                from .ocr import mistral_ocr_pdf

                pages = extract_text_pypdf2(pdf_bytes)
                joined = "\n".join(pages)

                if is_text_usable(joined):
                    extraction_method = "text"
                else:
                    if len(pdf_bytes) > 10 * 1024 * 1024:
                        result["reason"] = "pdf_too_large_for_ocr"
                        return result
                    try:
                        pages = mistral_ocr_pdf(pdf_bytes)
                        extraction_method = "ocr"
                    except Exception as e:
                        result["reason"] = f"ocr_failed:{str(e)}"
                        return result

                if not pages:
                    result["reason"] = "no_text_extracted"
                    return result

                # preprocess
                from .preprocess import normalize_arabic, drop_common_headers_footers

                language = src.get("language", "ar")
                if language == "ar":
                    pages = [normalize_arabic(p) for p in pages]
                    pages = drop_common_headers_footers(pages)

                pages_head = pages[:HEAD_PAGES_N]

                raw_doc = DocRaw(
                    doc_id=doc_id,
                    user_id=user_id,
                    book_id=book_id,
                    source_url=url,
                    source_type=src.get("source_type", "pdf"),
                    domain=src.get("domain", ""),
                    language=language,
                    pages_head=pages_head,
                    extraction_method=extraction_method,
                    pdf_size_bytes=len(pdf_bytes),
                    status="pending",
                    error_reason="",
                )

                # insert raw doc (phase 1)
                insert_raw_document(raw_doc)

                result.update(
                    {
                        "status": "prepared",
                        "doc_id": doc_id,
                        "raw_doc": raw_doc,
                        "pages": pages,
                        "extraction": extraction_method,
                        "duration": time.time() - start,
                    }
                )
                return result

            except Exception as e:
                result["reason"] = f"fetch_prepare_failed:{str(e)}"
                return result

        # Phase A: concurrent fetch + prepare
        prepared_items = []
        with ThreadPoolExecutor(max_workers=MAX_INGEST_WORKERS) as ex:
            futures = {ex.submit(fetch_and_prepare, src): src for src in sources}
            for fut in as_completed(futures):
                res = fut.result()
                prepared_items.append(res)

        # Phase B: sequential embedding & Qdrant upsert with a single embedder instance
        from .rag_engine_sources import ArabicBookRAGWithSources

        rag = ArabicBookRAGWithSources(
            user_id=user_id, book_id=book_id, embedding_model=EMBEDDING_MODEL
        )

        items_out = []
        ingested = 0
        rejected = 0

        for item in prepared_items:
            if item.get("status") != "prepared":
                items_out.append(item)
                rejected += 1
                continue

            doc_id = item["doc_id"]
            pages = item["pages"]
            raw_doc = item["raw_doc"]
            try:
                stats = rag.ingest_pages(pages=pages, raw_doc=raw_doc)
                items_out.append(
                    {
                        "url": item["url"],
                        "status": "ingested",
                        "doc_id": doc_id,
                        "source_type": raw_doc.source_type,
                        "extraction": item["extraction"],
                        "stats": stats,
                    }
                )
                ingested += 1
            except Exception as e:
                # mark failed in the DB
                try:
                    mark_raw_status(doc_id, "failed", f"qdrant_failed:{str(e)}")
                except Exception:
                    pass
                items_out.append(
                    {
                        "url": item["url"],
                        "status": "rejected",
                        "reason": f"qdrant_failed:{str(e)}",
                        "doc_id": doc_id,
                    }
                )
                rejected += 1

        return items_out, ingested, rejected, rag.collection
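A caller sketch for the new two-phase entry point (URLs and IDs are invented;
the four-tuple shape comes straight from the return statement above):

    from rag.ingest_net import ingest_from_net

    sources = [
        {"url": "https://example.org/book1.pdf", "language": "ar", "domain": "example.org"},
        {"url": "https://example.org/book2.pdf", "language": "ar", "domain": "example.org"},
    ]

    items, ingested, rejected, collection = ingest_from_net(
        user_id="u123", book_id="b456", sources=sources
    )
    print(f"collection={collection} ingested={ingested} rejected={rejected}")
    for item in items:
        if item["status"] != "ingested":
            print(item["url"], "->", item.get("reason"))

Phase A can safely run in threads because it only does I/O (HTTP fetch, OCR
call, DB insert); Phase B stays sequential so the SentenceTransformer is
loaded exactly once.
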
rag/rag_engine_sources.py
CHANGED
@@ -1,24 +1,28 @@
@@ -28,7 +32,12 @@ class ArabicBookRAGWithSources:
@@ -36,41 +45,47 @@ class ArabicBookRAGWithSources:
@@ -88,9 +103,11 @@ class ArabicBookRAGWithSources:
@@ -100,17 +117,9 @@
@@ -129,23 +138,10 @@

Before ("…" marks text lost in extraction):

    # rag/rag_engine_sources.py
    import os
    import uuid
    from dataclasses import asdict
    from typing import List, Optional
    from …

    from sentence_transformers import SentenceTransformer
    from qdrant_client import QdrantClient
    from qdrant_client.http import models as qm

    from schemas.books.sources_schema import ChunkRecord, DocRaw


    class ArabicBookRAGWithSources:
        def __init__(…
            self.user_id = user_id
            self.book_id = book_id
            self.collection = f"user_{user_id}__book_{book_id}"

            self.embedder = SentenceTransformer(embedding_model)
            self.qdrant = QdrantClient(
                url=os.environ["QDRANT_URL"],
    …
        def _ensure_collection(self):
            dim = self.embedder.get_sentence_embedding_dimension()
            …
            if self.collection not in collections:
                self.qdrant.create_collection(
                    collection_name=self.collection,
    …
                )

        def ingest_pages(self, pages: List[str], raw_doc: DocRaw):
            …
            chunks = chunk_pages(pages)

            …
                )
                for txt, ps, pe in chunks
            ]

            …
                qm.PointStruct(
                    id=r.chunk_id,
                    vector=v.tolist(),
    …
                        "text": r.text,
                    },
                )
                …

            return {"pages": len(pages), "chunks": len(records)}
    …
            doc_id: Optional[str] = None,
            top_k: int = 8,
        ):
            """
            Semantic retrieval from Qdrant collection.
            Returns list of scored chunks.
            """
            if not queries:
                return []

            # -------------------------
            # Optional filter
            # -------------------------
            must = []
            if doc_id:
                must.append(
    …
                if not q:
                    continue

                # normalize Arabic
                q_norm = normalize_arabic(q)

                vec = self.embedder.encode(
                    [q_norm],
                    normalize_embeddings=True,
                )[0]

                # search
                # res = self.qdrant.search(
                #     collection_name=self.collection,
                #     query_vector=vec.tolist(),
                #     limit=top_k,
                #     with_payload=True,
                #     query_filter=query_filter,
                # )
                res = self.qdrant.query_points(
                    collection_name=self.collection,
                    query=vec.tolist(),
    …
After:

    # rag/rag_engine_sources.py
    import os
    import uuid
    from typing import List, Optional
    from dataclasses import asdict

    from sentence_transformers import SentenceTransformer
    from qdrant_client import QdrantClient
    from qdrant_client.http import models as qm

    from schemas.books.sources_schema import ChunkRecord, DocRaw
    from .preprocess import normalize_arabic
    from .chuncking import chunk_pages  # corrected the file name from chuncking -> chunking


    class ArabicBookRAGWithSources:
        def __init__(
            self, user_id: str, book_id: str, embedding_model: str, batch_size: int = 128
        ):
            self.user_id = user_id
            self.book_id = book_id
            self.collection = f"user_{user_id}__book_{book_id}"
            self.batch_size = batch_size

            # load the embedder once per instance (this is why ingest_from_net uses a single instance)
            self.embedder = SentenceTransformer(embedding_model)
            self.qdrant = QdrantClient(
                url=os.environ["QDRANT_URL"],
    …
        def _ensure_collection(self):
            dim = self.embedder.get_sentence_embedding_dimension()
            existing = self.qdrant.get_collections()
            collections = (
                [c.name for c in existing.collections]
                if existing and getattr(existing, "collections", None)
                else []
            )
            if self.collection not in collections:
                self.qdrant.create_collection(
                    collection_name=self.collection,
    …
                )

        def ingest_pages(self, pages: List[str], raw_doc: DocRaw):
            """
            Create chunks (with page ranges), embed in batches, and upsert to Qdrant in batches.
            Returns a stats dict.
            """
            chunks = chunk_pages(pages)

            records = []
            for txt, ps, pe in chunks:
                records.append(
                    ChunkRecord(
                        chunk_id=str(uuid.uuid4()),
                        user_id=self.user_id,
                        book_id=self.book_id,
                        doc_id=raw_doc.doc_id,
                        source_url=raw_doc.source_url,
                        source_type=raw_doc.source_type,
                        domain=raw_doc.domain,
                        title="",
                        authors="",
                        year=None,
                        publisher_or_journal="",
                        language=raw_doc.language,
                        apa7="",
                        page_start=ps,
                        page_end=pe,
                        text=txt,
                    )
                )

            # encode in batches to save memory/time
            vectors = []
            texts = [r.text for r in records]
            for i in range(0, len(texts), self.batch_size):
                batch_texts = texts[i : i + self.batch_size]
                batch_vecs = self.embedder.encode(batch_texts, normalize_embeddings=True)
                vectors.extend(batch_vecs)

            # upsert to Qdrant in batches
            points = []
            for r, v in zip(records, vectors):
                points.append(
                    qm.PointStruct(
                        id=r.chunk_id,
                        vector=v.tolist(),
    …
                            "text": r.text,
                        },
                    )
                )

            for i in range(0, len(points), self.batch_size):
                batch = points[i : i + self.batch_size]
                self.qdrant.upsert(collection_name=self.collection, points=batch)

            return {"pages": len(pages), "chunks": len(records)}
    …
            doc_id: Optional[str] = None,
            top_k: int = 8,
        ):
            if not queries:
                return []

            must = []
            if doc_id:
                must.append(
    …
                if not q:
                    continue

                q_norm = normalize_arabic(q)
                vec = self.embedder.encode([q_norm], normalize_embeddings=True)[0]

                # use query_points (or search, depending on client version)
                res = self.qdrant.query_points(
                    collection_name=self.collection,
                    query=vec.tolist(),
    …
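
The diff rendering ends inside the query_points() call above. For reference, a
standalone sketch of how such a filtered query is typically assembled with
qdrant-client >= 1.10; the payload key "doc_id", the limit wiring, and the
result handling below are assumptions, not this commit's code:

    import os
    from typing import List, Optional

    from qdrant_client import QdrantClient
    from qdrant_client.http import models as qm


    def search_chunks(
        collection: str,
        vec: List[float],
        doc_id: Optional[str] = None,
        top_k: int = 8,
    ):
        client = QdrantClient(url=os.environ["QDRANT_URL"])
        must = []
        if doc_id:
            # assumed payload key, mirroring the per-chunk fields ingest_pages stores
            must.append(qm.FieldCondition(key="doc_id", match=qm.MatchValue(value=doc_id)))

        res = client.query_points(
            collection_name=collection,
            query=vec,
            query_filter=qm.Filter(must=must) if must else None,
            limit=top_k,
            with_payload=True,
        )
        # query_points returns a QueryResponse; each point has .id, .score, .payload
        return [{"score": p.score, **(p.payload or {})} for p in res.points]

On older clients the equivalent call is the deprecated search(...,
query_vector=...), which is what the commented-out block in the old version
used.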
|