Spaces:

Rochane
/

aim

Sleeping

App Files Files Community

Rochane committed on Mar 24

Commit

45620de

1 Parent(s): cc33196

Add file upload RAG: requirements + rag.py

Browse files

Files changed (2) hide show

app/rag.py +173 -12
requirements.txt +1 -0

app/rag.py CHANGED Viewed

@@ -1,6 +1,9 @@
 """RAG layer: load corpus, chunk, embed, and retrieve."""
 import os
 import chromadb
 from sentence_transformers import SentenceTransformer
@@ -13,6 +16,9 @@ TOP_K = 3
 _model: SentenceTransformer | None = None
 _collection: chromadb.Collection | None = None
 def _get_model() -> SentenceTransformer:
@@ -22,6 +28,17 @@ def _get_model() -> SentenceTransformer:
     return _model
 def _approximate_token_split(text: str, size: int, overlap: int) -> list[str]:
     """Split text into chunks of approximately `size` words with `overlap`."""
     words = text.split()
@@ -36,7 +53,7 @@ def _approximate_token_split(text: str, size: int, overlap: int) -> list[str]:
 def _read_txt(path: str) -> str:
-    with open(path, "r", encoding="utf-8") as f:
         return f.read()
@@ -50,15 +67,65 @@ def _read_pdf(path: str) -> str:
         return ""
 def load_corpus() -> None:
-    """Load all .pdf and .txt files from corpus, chunk, embed, store in ChromaDB."""
     global _collection
-    client = chromadb.Client(chromadb.config.Settings(
-        persist_directory=CHROMA_DIR,
-        anonymized_telemetry=False,
-        is_persistent=True,
-    ))
     try:
         client.delete_collection("corpus")
@@ -76,17 +143,16 @@ def load_corpus() -> None:
     all_meta: list[dict] = []
     if not os.path.isdir(CORPUS_DIR):
         return
     for filename in sorted(os.listdir(CORPUS_DIR)):
         filepath = os.path.join(CORPUS_DIR, filename)
-        if filename.lower().endswith(".txt"):
-            text = _read_txt(filepath)
-        elif filename.lower().endswith(".pdf"):
-            text = _read_pdf(filepath)
-        else:
             continue
         if not text.strip():
             continue
@@ -107,6 +173,101 @@ def load_corpus() -> None:
         )
 def retrieve(query: str, top_k: int = TOP_K) -> list[str]:
     """Retrieve the top_k most relevant chunks for a query."""
     if _collection is None or _collection.count() == 0:

 """RAG layer: load corpus, chunk, embed, and retrieve."""
 import os
+import shutil
+import tempfile
+import zipfile
 import chromadb
 from sentence_transformers import SentenceTransformer
 _model: SentenceTransformer | None = None
 _collection: chromadb.Collection | None = None
+_client: chromadb.ClientAPI | None = None
+SUPPORTED_EXTENSIONS = {".txt", ".pdf", ".pptx", ".ppt"}
 def _get_model() -> SentenceTransformer:
     return _model
+def _get_client() -> chromadb.ClientAPI:
+    """Return the module-wide ChromaDB client, creating it on first use.
+
+    The client is built with persistence enabled under CHROMA_DIR and with
+    anonymized telemetry turned off, then cached in the module-level `_client`.
+    """
+    global _client
+    if _client is not None:
+        return _client
+    settings = chromadb.config.Settings(
+        persist_directory=CHROMA_DIR,
+        anonymized_telemetry=False,
+        is_persistent=True,
+    )
+    _client = chromadb.Client(settings)
+    return _client
 def _approximate_token_split(text: str, size: int, overlap: int) -> list[str]:
     """Split text into chunks of approximately `size` words with `overlap`."""
     words = text.split()
 def _read_txt(path: str) -> str:
+    """Return the file's content decoded as UTF-8; undecodable bytes are dropped."""
+    with open(path, "r", encoding="utf-8", errors="ignore") as f:
         return f.read()
         return ""
+def _read_pptx(path: str) -> str:
+    """Return the non-empty paragraph texts of a presentation, one per line.
+
+    Only shapes that have a text frame are read. Any failure (python-pptx not
+    importable, unreadable file, ...) yields an empty string instead of raising.
+    """
+    try:
+        from pptx import Presentation
+        paragraphs = (
+            para.text.strip()
+            for slide in Presentation(path).slides
+            for shape in slide.shapes
+            if shape.has_text_frame
+            for para in shape.text_frame.paragraphs
+        )
+        # The generator is consumed here, still inside the try block.
+        return "\n".join(line for line in paragraphs if line)
+    except Exception:
+        return ""
+def _read_file(path: str) -> str:
+    """Extract text from `path` using the reader matching its extension.
+
+    The extension check is case-insensitive. Paths with an extension other
+    than .txt, .pdf, .pptx or .ppt give an empty string.
+    """
+    lowered = path.lower()
+    if lowered.endswith(".txt"):
+        return _read_txt(path)
+    if lowered.endswith(".pdf"):
+        return _read_pdf(path)
+    if lowered.endswith(".pptx") or lowered.endswith(".ppt"):
+        return _read_pptx(path)
+    return ""
+def _extract_zip(zip_bytes: bytes) -> list[tuple[str, bytes]]:
+    """Collect the supported files contained in a ZIP archive.
+
+    The archive is read straight from memory: nothing is written to disk and
+    members with an unsupported extension are never decompressed. Entries
+    located under a directory whose name starts with "." or "__" (such as
+    __MACOSX) and files whose name starts with "." are skipped.
+
+    Args:
+        zip_bytes: Raw content of the uploaded archive.
+
+    Returns:
+        A list of (base filename, content) tuples in archive order; an empty
+        list when the bytes are not a readable ZIP archive.
+    """
+    import io  # local import: only this function needs an in-memory file
+    results: list[tuple[str, bytes]] = []
+    try:
+        with zipfile.ZipFile(io.BytesIO(zip_bytes), "r") as zf:
+            for info in zf.infolist():
+                # Member names normally use "/"; also accept the backslashes
+                # that some Windows archivers emit.
+                *parents, fname = info.filename.replace("\\", "/").split("/")
+                # An empty final component means the entry is a directory.
+                if not fname or fname.startswith("."):
+                    continue
+                # Skip __MACOSX and hidden directories (this also drops any
+                # entry whose path contains a ".." component).
+                if any(part.startswith((".", "__")) for part in parents):
+                    continue
+                ext = os.path.splitext(fname)[1].lower()
+                if ext in SUPPORTED_EXTENSIONS:
+                    results.append((fname, zf.read(info)))
+    except zipfile.BadZipFile:
+        # Corrupt or non-ZIP upload: report "nothing found" to the caller
+        # instead of aborting the whole upload batch.
+        return []
+    return results
 def load_corpus() -> None:
+    """Load all supported files from corpus, chunk, embed, store in ChromaDB."""
     global _collection
+    client = _get_client()
     try:
         client.delete_collection("corpus")
     all_meta: list[dict] = []
     if not os.path.isdir(CORPUS_DIR):
+        os.makedirs(CORPUS_DIR, exist_ok=True)
         return
     for filename in sorted(os.listdir(CORPUS_DIR)):
         filepath = os.path.join(CORPUS_DIR, filename)
+        ext = os.path.splitext(filename)[1].lower()
+        if ext not in SUPPORTED_EXTENSIONS:
             continue
+        text = _read_file(filepath)
         if not text.strip():
             continue
         )
+def _add_single_file(filename: str, file_bytes: bytes) -> dict:
+    """Save one uploaded file into the corpus directory and index its chunks.
+
+    Args:
+        filename: Name supplied by the uploader. Only its final path
+            component is used, so the file always lands inside CORPUS_DIR.
+        file_bytes: Raw file content.
+
+    Returns:
+        A dict with "filename", "status" ("ok" or "error") and either
+        "chunks" (number of chunks indexed) or "message" (error text).
+    """
+    # The name comes from an upload: drop any directory part (either
+    # separator style) so names like "../x" cannot escape the corpus dir.
+    safe_name = os.path.basename(filename.replace("\\", "/"))
+    if safe_name in ("", ".", ".."):
+        return {"filename": filename, "status": "error", "message": "Nom de fichier invalide"}
+    os.makedirs(CORPUS_DIR, exist_ok=True)
+    filepath = os.path.join(CORPUS_DIR, safe_name)
+    with open(filepath, "wb") as f:
+        f.write(file_bytes)
+    text = _read_file(filepath)
+    if not text.strip():
+        # Nothing to index: do not leave an unusable file in the corpus.
+        os.remove(filepath)
+        return {"filename": safe_name, "status": "error", "message": "Texte non extractible"}
+    chunks = _approximate_token_split(text, CHUNK_SIZE, CHUNK_OVERLAP)
+    if _collection is None:
+        # No collection yet: a full corpus load picks up the file just saved.
+        load_corpus()
+        return {"filename": safe_name, "status": "ok", "chunks": len(chunks)}
+    # Remove old chunks from same file if re-uploading
+    try:
+        existing = _collection.get(where={"source": safe_name})
+        if existing["ids"]:
+            _collection.delete(ids=existing["ids"])
+    except Exception:
+        # Best effort: a failed cleanup must not block the new upload.
+        pass
+    # Load the model only now; the early-return paths above do not use it.
+    model = _get_model()
+    chunk_ids = [f"{safe_name}_{i}" for i in range(len(chunks))]
+    metas = [{"source": safe_name, "chunk_index": i} for i in range(len(chunks))]
+    embeddings = model.encode(chunks).tolist()
+    _collection.add(
+        ids=chunk_ids,
+        embeddings=embeddings,
+        documents=chunks,
+        metadatas=metas,
+    )
+    return {"filename": safe_name, "status": "ok", "chunks": len(chunks)}
+def add_documents(files: list[tuple[str, bytes]]) -> list[dict]:
+    """Ingest uploaded files and return one result dict per processed file.
+
+    A name ending in ".zip" (case-insensitive) is unpacked and each supported
+    member is ingested on its own; an archive without any supported member
+    produces a single error entry. Every other file is ingested directly.
+    """
+    outcomes: list[dict] = []
+    for name, payload in files:
+        if not name.lower().endswith(".zip"):
+            outcomes.append(_add_single_file(name, payload))
+            continue
+        members = _extract_zip(payload)
+        if members:
+            outcomes.extend(
+                _add_single_file(member_name, member_bytes)
+                for member_name, member_bytes in members
+            )
+        else:
+            outcomes.append({
+                "filename": name,
+                "status": "error",
+                "message": "Aucun fichier supporte trouve dans le ZIP",
+            })
+    return outcomes
+def list_documents() -> list[dict]:
+    """Return the name and size of every supported file in CORPUS_DIR.
+
+    Entries are sorted by filename. The list is empty when the corpus
+    directory does not exist.
+    """
+    if not os.path.isdir(CORPUS_DIR):
+        return []
+    return [
+        {"filename": name, "size": os.path.getsize(os.path.join(CORPUS_DIR, name))}
+        for name in sorted(os.listdir(CORPUS_DIR))
+        if os.path.splitext(name)[1].lower() in SUPPORTED_EXTENSIONS
+    ]
+def delete_document(filename: str) -> bool:
+    """Delete a document from the corpus directory together with its embeddings.
+
+    Args:
+        filename: Bare file name of a document stored in CORPUS_DIR; names
+            containing a directory part are refused.
+
+    Returns:
+        True when the file existed and was removed, False otherwise.
+    """
+    # Refuse anything that is not a bare file name: joining "../x" or an
+    # absolute path would let a caller delete files outside the corpus.
+    if os.path.basename(filename) != filename:
+        return False
+    filepath = os.path.join(CORPUS_DIR, filename)
+    if not os.path.isfile(filepath):
+        return False
+    os.remove(filepath)
+    if _collection is not None:
+        try:
+            existing = _collection.get(where={"source": filename})
+            if existing["ids"]:
+                _collection.delete(ids=existing["ids"])
+        except Exception:
+            # Best effort: the file is already gone, so a failure to purge
+            # its embeddings is not reported as a failed deletion.
+            pass
+    return True
 def retrieve(query: str, top_k: int = TOP_K) -> list[str]:
     """Retrieve the top_k most relevant chunks for a query."""
     if _collection is None or _collection.count() == 0:

requirements.txt CHANGED Viewed

@@ -6,4 +6,5 @@ sentence-transformers==3.3.1
 pydantic==2.10.4
 python-multipart==0.0.20
 pypdf2==3.0.1
 python-dotenv==1.0.1

 pydantic==2.10.4
 python-multipart==0.0.20
 pypdf2==3.0.1
+python-pptx==1.0.2
 python-dotenv==1.0.1