ikram98ai commited on
Commit
d8dedbc
·
1 Parent(s): 4eadf6e

refactoring ingest function

Browse files
Files changed (4) hide show
  1. src/app.py +11 -3
  2. src/core/ingest.py +20 -7
  3. src/core/rag.ipynb +127 -12
  4. src/core/retrieval.py +3 -0
src/app.py CHANGED
@@ -10,7 +10,7 @@ _project_root = Path(__file__).resolve().parents[1]
10
  if str(_project_root) not in sys.path:
11
  sys.path.insert(0, str(_project_root))
12
 
13
- from src.core.ingest import ingest
14
  from src.core.retrieval import generate, retrieval
15
  from src.core.index import MetaData
16
  from src.core.synthetic_data import EVAL_QUERIES, SYNTHETIC_DOCUMENTS
@@ -48,8 +48,16 @@ def ingest_files(files:List[str], index_name:str, lang:Literal["en", "ja"], doma
48
  filter_data = MetaData(
49
  language=lang, domain=domain, section=section, topic=topic, doc_type=doc_type
50
  )
51
- result = ingest(index_name, filter_data, files)
52
- return {"status": "success", "message": result}
 
 
 
 
 
 
 
 
53
 
54
  def _add_metric(doc):
55
  return (f"\n### source: {doc.metadata.get('source_name','None')}"
 
10
  if str(_project_root) not in sys.path:
11
  sys.path.insert(0, str(_project_root))
12
 
13
+ from src.core.ingest import load_documents, get_chunks, ingest_documents
14
  from src.core.retrieval import generate, retrieval
15
  from src.core.index import MetaData
16
  from src.core.synthetic_data import EVAL_QUERIES, SYNTHETIC_DOCUMENTS
 
48
  filter_data = MetaData(
49
  language=lang, domain=domain, section=section, topic=topic, doc_type=doc_type
50
  )
51
+ try:
52
+ docs = load_documents(files)
53
+ chunks = get_chunks(docs, filter_data)
54
+ message = ingest_documents(chunks, index_name)
55
+ except Exception as e:
56
+ message = f"Error during ingestion: {str(e)}"
57
+ print(message)
58
+ return {"status": "error", "message": message}
59
+
60
+ return {"status": "success", "message": message}
61
 
62
  def _add_metric(doc):
63
  return (f"\n### source: {doc.metadata.get('source_name','None')}"
src/core/ingest.py CHANGED
@@ -5,7 +5,9 @@ from langchain_openai import ChatOpenAI
5
  from dotenv import load_dotenv, find_dotenv
6
  from typing import List
7
  import uuid
8
- from .index import get_vectorstore, MetaData
 
 
9
  from .utils import mask_pii
10
 
11
  find_dotenv()
@@ -14,18 +16,24 @@ load_dotenv()
14
  model = ChatOpenAI(model="gpt-5-nano")
15
 
16
 
17
- def ingest(file_paths: List[str], collection_name: str, metadata: MetaData):
 
18
  documents: list[Document] = []
19
  for file_path in file_paths:
20
  if file_path.endswith(".txt"):
21
  docs = TextLoader(file_path, encoding="utf-8").load()
22
  elif file_path.endswith(".pdf"):
23
  docs = PDFMinerLoader(file_path).load()
 
 
 
24
  documents.extend(docs)
25
- for doc in docs:
26
- doc.metadata["source"] = file_path.split("/")[-1]
27
-
28
  print(f"loaded {len(documents)} documents from {len(file_paths)} files.")
 
 
 
 
29
  text_splitter = RecursiveCharacterTextSplitter(
30
  chunk_size=1200, # chunk size (characters)
31
  chunk_overlap=200, # chunk overlap (characters)
@@ -35,23 +43,28 @@ def ingest(file_paths: List[str], collection_name: str, metadata: MetaData):
35
  print(f"generated {len(chunks)} chunks.")
36
 
37
  doc_id = str(uuid.uuid4())
38
- docs = [
39
  Document(
40
  page_content=mask_pii(chunk.page_content),
41
  metadata={
42
  "doc_id": doc_id,
43
  "chunk_id": str(uuid.uuid4()),
44
- "source_name": chunk.metadata["source"],
45
  "start_index": chunk.metadata["start_index"],
46
  **metadata.model_dump(),
47
  },
48
  )
49
  for chunk in chunks
50
  ]
 
51
 
 
 
 
52
  vectorstore = get_vectorstore(collection_name)
53
  ids = [str(uuid.uuid4()) for _ in range(len(docs))]
54
  vectorstore.add_documents(docs, ids=ids)
55
  success_message = f"Ingested {len(docs)} documents into {collection_name} index."
56
  print(success_message)
57
  return success_message
 
 
5
  from dotenv import load_dotenv, find_dotenv
6
  from typing import List
7
  import uuid
8
+
9
+ from src.core.index import get_vectorstore
10
+ from .index import MetaData
11
  from .utils import mask_pii
12
 
13
  find_dotenv()
 
16
  model = ChatOpenAI(model="gpt-5-nano")
17
 
18
 
19
+ def load_documents(file_paths: List[str]):
20
+ """Load .txt and .pdf files from the given paths into Document objects."""
21
  documents: list[Document] = []
22
  for file_path in file_paths:
23
  if file_path.endswith(".txt"):
24
  docs = TextLoader(file_path, encoding="utf-8").load()
25
  elif file_path.endswith(".pdf"):
26
  docs = PDFMinerLoader(file_path).load()
27
+ else:
28
+ print(f"Unsupported file format: {file_path}")
29
+ continue
30
  documents.extend(docs)
31
+
 
 
32
  print(f"loaded {len(documents)} documents from {len(file_paths)} files.")
33
+ return documents
34
+
35
+ def get_chunks(documents: List[Document], metadata: MetaData):
36
+ """Split documents into chunks and mask PII."""
37
  text_splitter = RecursiveCharacterTextSplitter(
38
  chunk_size=1200, # chunk size (characters)
39
  chunk_overlap=200, # chunk overlap (characters)
 
43
  print(f"generated {len(chunks)} chunks.")
44
 
45
  doc_id = str(uuid.uuid4())
46
+ chunks = [
47
  Document(
48
  page_content=mask_pii(chunk.page_content),
49
  metadata={
50
  "doc_id": doc_id,
51
  "chunk_id": str(uuid.uuid4()),
52
+ "source_name": chunk.metadata["source"].split("/")[-1],
53
  "start_index": chunk.metadata["start_index"],
54
  **metadata.model_dump(),
55
  },
56
  )
57
  for chunk in chunks
58
  ]
59
+ return chunks
60
 
61
+
62
+ def ingest_documents(docs: List[Document], collection_name: str):
63
+ """Ingest documents into the specified vectorstore collection."""
64
  vectorstore = get_vectorstore(collection_name)
65
  ids = [str(uuid.uuid4()) for _ in range(len(docs))]
66
  vectorstore.add_documents(docs, ids=ids)
67
  success_message = f"Ingested {len(docs)} documents into {collection_name} index."
68
  print(success_message)
69
  return success_message
70
+
src/core/rag.ipynb CHANGED
@@ -51,7 +51,7 @@
51
  },
52
  {
53
  "cell_type": "code",
54
- "execution_count": 3,
55
  "id": "db72701e",
56
  "metadata": {},
57
  "outputs": [],
@@ -78,12 +78,12 @@
78
  },
79
  {
80
  "cell_type": "code",
81
- "execution_count": 4,
82
  "id": "f6037cfd",
83
  "metadata": {},
84
  "outputs": [],
85
  "source": [
86
- "from langchain_community.document_loaders import PDFMinerLoader\n",
87
  "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
88
  "from langchain_core.documents import Document\n",
89
  "from langchain_openai import ChatOpenAI\n",
@@ -96,17 +96,81 @@
96
  "load_dotenv()\n",
97
  "\n",
98
  "model = ChatOpenAI(model=\"gpt-5-nano\")\n",
 
 
 
 
 
 
 
 
 
 
99
  "\n",
100
- "\n",
101
- "def ingest(file_paths: List[str], collection_name: str, metadata: MetaData):\n",
102
  " documents: list[Document] = []\n",
103
  " for file_path in file_paths:\n",
104
- " docs = PDFMinerLoader(file_path).load()\n",
 
 
 
105
  " documents.extend(docs)\n",
106
- " for doc in docs:\n",
107
- " doc.metadata[\"source\"] = file_path.split(\"/\")[-1]\n",
108
- " \n",
109
  " print(f\"loaded {len(documents)} documents from {len(file_paths)} files.\")\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  " text_splitter = RecursiveCharacterTextSplitter(\n",
111
  " chunk_size=1200, # chunk size (characters)\n",
112
  " chunk_overlap=200, # chunk overlap (characters)\n",
@@ -116,21 +180,72 @@
116
  " print(f\"generated {len(chunks)} chunks.\")\n",
117
  "\n",
118
  " doc_id = str(uuid.uuid4())\n",
119
- " docs = [\n",
120
  " Document(\n",
121
  " page_content=mask_pii(chunk.page_content),\n",
122
  " metadata={\n",
123
  " \"doc_id\": doc_id,\n",
124
  " \"chunk_id\": str(uuid.uuid4()),\n",
125
- " \"source_name\": chunk.metadata[\"source\"],\n",
126
- " \"total_pages\": chunk.metadata[\"total_pages\"],\n",
127
  " \"start_index\": chunk.metadata[\"start_index\"],\n",
128
  " **metadata.model_dump(),\n",
129
  " },\n",
130
  " )\n",
131
  " for chunk in chunks\n",
132
  " ]\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  "\n",
 
 
134
  " vectorstore = get_vectorstore(collection_name)\n",
135
  " ids = [str(uuid.uuid4()) for _ in range(len(docs))]\n",
136
  " vectorstore.add_documents(docs, ids=ids)\n",
 
51
  },
52
  {
53
  "cell_type": "code",
54
+ "execution_count": 2,
55
  "id": "db72701e",
56
  "metadata": {},
57
  "outputs": [],
 
78
  },
79
  {
80
  "cell_type": "code",
81
+ "execution_count": 3,
82
  "id": "f6037cfd",
83
  "metadata": {},
84
  "outputs": [],
85
  "source": [
86
+ "from langchain_community.document_loaders import PDFMinerLoader, TextLoader\n",
87
  "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
88
  "from langchain_core.documents import Document\n",
89
  "from langchain_openai import ChatOpenAI\n",
 
96
  "load_dotenv()\n",
97
  "\n",
98
  "model = ChatOpenAI(model=\"gpt-5-nano\")\n",
99
+ "\n"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": 4,
105
+ "id": "03501b3b",
106
+ "metadata": {},
107
+ "outputs": [],
108
+ "source": [
109
  "\n",
110
+ "def load_documents(file_paths: List[str]):\n",
111
+ "    \"\"\"Load .txt and .pdf files from the given paths into Document objects.\"\"\"\n",
112
  " documents: list[Document] = []\n",
113
  " for file_path in file_paths:\n",
114
+ " if file_path.endswith(\".txt\"):\n",
115
+ " docs = TextLoader(file_path, encoding=\"utf-8\").load()\n",
116
+ " elif file_path.endswith(\".pdf\"):\n",
117
+ " docs = PDFMinerLoader(file_path).load()\n",
118
  " documents.extend(docs)\n",
119
+ "\n",
 
 
120
  " print(f\"loaded {len(documents)} documents from {len(file_paths)} files.\")\n",
121
+ " return documents\n"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": 10,
127
+ "id": "0b901011",
128
+ "metadata": {},
129
+ "outputs": [
130
+ {
131
+ "name": "stdout",
132
+ "output_type": "stream",
133
+ "text": [
134
+ "loaded 1 documents from 1 files.\n"
135
+ ]
136
+ }
137
+ ],
138
+ "source": [
139
+ "doc = load_documents([\"../../data/gemma.pdf\",])"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 15,
145
+ "id": "3381a9e7",
146
+ "metadata": {},
147
+ "outputs": [
148
+ {
149
+ "data": {
150
+ "text/plain": [
151
+ "'../../data/gemma.pdf'"
152
+ ]
153
+ },
154
+ "execution_count": 15,
155
+ "metadata": {},
156
+ "output_type": "execute_result"
157
+ }
158
+ ],
159
+ "source": [
160
+ "doc[0].metadata[\"source\"]"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": 16,
166
+ "id": "67879c7d",
167
+ "metadata": {},
168
+ "outputs": [],
169
+ "source": [
170
+ "\n",
171
+ "\n",
172
+ "def get_chunks(documents: List[Document], metadata: MetaData):\n",
173
+ " \"\"\"Split documents into chunks and mask PII.\"\"\"\n",
174
  " text_splitter = RecursiveCharacterTextSplitter(\n",
175
  " chunk_size=1200, # chunk size (characters)\n",
176
  " chunk_overlap=200, # chunk overlap (characters)\n",
 
180
  " print(f\"generated {len(chunks)} chunks.\")\n",
181
  "\n",
182
  " doc_id = str(uuid.uuid4())\n",
183
+ " chunks = [\n",
184
  " Document(\n",
185
  " page_content=mask_pii(chunk.page_content),\n",
186
  " metadata={\n",
187
  " \"doc_id\": doc_id,\n",
188
  " \"chunk_id\": str(uuid.uuid4()),\n",
189
+ " \"source_name\": chunk.metadata[\"source\"].split(\"/\")[-1],\n",
 
190
  " \"start_index\": chunk.metadata[\"start_index\"],\n",
191
  " **metadata.model_dump(),\n",
192
  " },\n",
193
  " )\n",
194
  " for chunk in chunks\n",
195
  " ]\n",
196
+ " return chunks\n"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": 18,
202
+ "id": "8ab1d37f",
203
+ "metadata": {},
204
+ "outputs": [
205
+ {
206
+ "name": "stdout",
207
+ "output_type": "stream",
208
+ "text": [
209
+ "generated 72 chunks.\n"
210
+ ]
211
+ }
212
+ ],
213
+ "source": [
214
+ "chunks = get_chunks(doc, MetaData(language=\"en\"))"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": 20,
220
+ "id": "f739cda1",
221
+ "metadata": {},
222
+ "outputs": [
223
+ {
224
+ "data": {
225
+ "text/plain": [
226
+ "dict_items([('doc_id', '8cf509fb-56d0-436b-b712-3a12e092c60f'), ('chunk_id', '2fe270b3-4d8c-4629-9d6b-e36e93f3294b'), ('source_name', 'gemma.pdf'), ('start_index', 0), ('language', 'en'), ('domain', None), ('section', None), ('topic', None), ('doc_type', None)])"
227
+ ]
228
+ },
229
+ "execution_count": 20,
230
+ "metadata": {},
231
+ "output_type": "execute_result"
232
+ }
233
+ ],
234
+ "source": [
235
+ "chunks[0].metadata.items()"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": null,
241
+ "id": "df9c6181",
242
+ "metadata": {},
243
+ "outputs": [],
244
+ "source": [
245
+ "\n",
246
  "\n",
247
+ "def ingest_to_vectorstore(docs: List[Document], collection_name: str):\n",
248
+ " \"\"\"Ingest documents into the specified vectorstore collection.\"\"\"\n",
249
  " vectorstore = get_vectorstore(collection_name)\n",
250
  " ids = [str(uuid.uuid4()) for _ in range(len(docs))]\n",
251
  " vectorstore.add_documents(docs, ids=ids)\n",
src/core/retrieval.py CHANGED
@@ -12,6 +12,7 @@ model = ChatOpenAI(model="gpt-5-nano")
12
 
13
 
14
  def reranker(query: str, docs: List[Document]) -> List[Document]:
 
15
  print(f"Retrieved {len(docs)} documents")
16
  if len(docs) <= 1:
17
  return docs
@@ -24,6 +25,7 @@ def reranker(query: str, docs: List[Document]) -> List[Document]:
24
  def retrieval(
25
  query: str, collection_name: str, filter_data: MetaData
26
  ) -> List[tuple[Document, float]]:
 
27
  vectorstore = get_vectorstore(collection_name)
28
  print(
29
  f"RETRIEVAL query: {query[:40]}, for {collection_name} collection, with filters: {filter_data}"
@@ -57,6 +59,7 @@ def retrieval(
57
 
58
 
59
  def generate(query: str, ctx_docs: List[Document]) -> str:
 
60
  context = "\n".join([doc.page_content for doc in ctx_docs])
61
  prompt = f"""Answer shortly to the user question according to the given context. Only answer if the context is given to you.
62
  question: {query}
 
12
 
13
 
14
  def reranker(query: str, docs: List[Document]) -> List[Document]:
15
+ """Rerank documents using BM25Retriever"""
16
  print(f"Retrieved {len(docs)} documents")
17
  if len(docs) <= 1:
18
  return docs
 
25
  def retrieval(
26
  query: str, collection_name: str, filter_data: MetaData
27
  ) -> List[tuple[Document, float]]:
28
+ """Retrieve relevant documents from the vector store based on the query and filters."""
29
  vectorstore = get_vectorstore(collection_name)
30
  print(
31
  f"RETRIEVAL query: {query[:40]}, for {collection_name} collection, with filters: {filter_data}"
 
59
 
60
 
61
  def generate(query: str, ctx_docs: List[Document]) -> str:
62
+ """Generate answer using the language model based on the query and context documents."""
63
  context = "\n".join([doc.page_content for doc in ctx_docs])
64
  prompt = f"""Answer shortly to the user question according to the given context. Only answer if the context is given to you.
65
  question: {query}