Spaces:
Sleeping
Sleeping
Sami Ali committed on
Commit ·
51349bc
1
Parent(s): 7f5929f
implement pipeline for magrag-assistant
Browse files- .gitignore +3 -0
- app.py +12 -0
- madrag.ipynb +113 -0
- src/__init__.py +0 -0
- src/constant.py +3 -0
- src/data_processor.py +108 -0
- src/embedding.py +26 -0
- src/vectorstore.py +91 -0
.gitignore
CHANGED
|
@@ -3,6 +3,9 @@ __pycache__/
|
|
| 3 |
*.py[codz]
|
| 4 |
*$py.class
|
| 5 |
|
|
|
|
|
|
|
|
|
|
| 6 |
# C extensions
|
| 7 |
*.so
|
| 8 |
|
|
|
|
| 3 |
*.py[codz]
|
| 4 |
*$py.class
|
| 5 |
|
| 6 |
+
# data
|
| 7 |
+
data
|
| 8 |
+
|
| 9 |
# C extensions
|
| 10 |
*.so
|
| 11 |
|
app.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from src.data_processor import DataProcessor
|
| 2 |
+
from src.embedding import EmbeddingManager
|
| 3 |
+
from src.vectorstore import VectorStore
|
| 4 |
+
|
| 5 |
+
if __name__ == '__main__':
|
| 6 |
+
dp = DataProcessor()
|
| 7 |
+
chunks, document = dp.build()
|
| 8 |
+
embd = EmbeddingManager()
|
| 9 |
+
chunks_embedding = embd.embed_texts(chunks)
|
| 10 |
+
vectorstore = VectorStore()
|
| 11 |
+
vectorstore.add_documents(chunks, chunks_embedding)
|
| 12 |
+
retriver = vectorstore.get_retriever()
|
madrag.ipynb
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 16,
|
| 6 |
+
"id": "c80e0812",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"from langchain.text_splitter import CharacterTextSplitter"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "code",
|
| 15 |
+
"execution_count": 17,
|
| 16 |
+
"id": "bbc6a9d6",
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"with open('./data/pmc/PMC10000000.txt', \"r\", encoding='utf-8') as file:\n",
|
| 21 |
+
" data = file.read()"
|
| 22 |
+
]
|
| 23 |
+
},
|
| 24 |
+
{
|
| 25 |
+
"cell_type": "code",
|
| 26 |
+
"execution_count": 18,
|
| 27 |
+
"id": "9eba0782",
|
| 28 |
+
"metadata": {},
|
| 29 |
+
"outputs": [
|
| 30 |
+
{
|
| 31 |
+
"name": "stdout",
|
| 32 |
+
"output_type": "stream",
|
| 33 |
+
"text": [
|
| 34 |
+
"23842\n"
|
| 35 |
+
]
|
| 36 |
+
}
|
| 37 |
+
],
|
| 38 |
+
"source": [
|
| 39 |
+
"print(len(data))"
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"cell_type": "code",
|
| 44 |
+
"execution_count": 23,
|
| 45 |
+
"id": "c0b716f8",
|
| 46 |
+
"metadata": {},
|
| 47 |
+
"outputs": [],
|
| 48 |
+
"source": [
|
| 49 |
+
"chunks = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0, separator=' ')"
|
| 50 |
+
]
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"cell_type": "code",
|
| 54 |
+
"execution_count": 24,
|
| 55 |
+
"id": "14aa384c",
|
| 56 |
+
"metadata": {},
|
| 57 |
+
"outputs": [],
|
| 58 |
+
"source": [
|
| 59 |
+
"temp = chunks.split_text(data)"
|
| 60 |
+
]
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"cell_type": "code",
|
| 64 |
+
"execution_count": 25,
|
| 65 |
+
"id": "77187982",
|
| 66 |
+
"metadata": {},
|
| 67 |
+
"outputs": [
|
| 68 |
+
{
|
| 69 |
+
"data": {
|
| 70 |
+
"text/plain": [
|
| 71 |
+
"24"
|
| 72 |
+
]
|
| 73 |
+
},
|
| 74 |
+
"execution_count": 25,
|
| 75 |
+
"metadata": {},
|
| 76 |
+
"output_type": "execute_result"
|
| 77 |
+
}
|
| 78 |
+
],
|
| 79 |
+
"source": [
|
| 80 |
+
"len(temp)"
|
| 81 |
+
]
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"cell_type": "code",
|
| 85 |
+
"execution_count": null,
|
| 86 |
+
"id": "4c254a11",
|
| 87 |
+
"metadata": {},
|
| 88 |
+
"outputs": [],
|
| 89 |
+
"source": []
|
| 90 |
+
}
|
| 91 |
+
],
|
| 92 |
+
"metadata": {
|
| 93 |
+
"kernelspec": {
|
| 94 |
+
"display_name": "venv",
|
| 95 |
+
"language": "python",
|
| 96 |
+
"name": "python3"
|
| 97 |
+
},
|
| 98 |
+
"language_info": {
|
| 99 |
+
"codemirror_mode": {
|
| 100 |
+
"name": "ipython",
|
| 101 |
+
"version": 3
|
| 102 |
+
},
|
| 103 |
+
"file_extension": ".py",
|
| 104 |
+
"mimetype": "text/x-python",
|
| 105 |
+
"name": "python",
|
| 106 |
+
"nbconvert_exporter": "python",
|
| 107 |
+
"pygments_lexer": "ipython3",
|
| 108 |
+
"version": "3.11.0"
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"nbformat": 4,
|
| 112 |
+
"nbformat_minor": 5
|
| 113 |
+
}
|
src/__init__.py
ADDED
|
File without changes
|
src/constant.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
src/data_processor.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from src.constant import BASE_DIR
|
| 3 |
+
from langchain.schema import Document
|
| 4 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 5 |
+
|
| 6 |
+
DATA_DIR = os.path.join(BASE_DIR, "data", "pmc")


class DataProcessor:
    """
    Handles loading, cleaning, and chunking of text files
    from the PubMed Central (PMC) dataset.
    """

    def __init__(self, data_path: str = DATA_DIR, max_files: int | None = 3):
        """
        :param data_path: directory containing the ``.txt`` corpus files.
        :param max_files: maximum number of files to load; ``None`` loads
            all files. Defaults to 3, matching the original hard-coded
            limiter, but is now configurable for full-corpus runs.
        """
        self.data_path = data_path
        self.max_files = max_files

    def _load_files(self) -> list[dict]:
        """
        Load raw text files from the dataset directory.
        Returns a list of dictionaries with file name and raw content.
        """
        data_list = []
        # sorted() makes the (possibly truncated) selection deterministic;
        # os.listdir order is filesystem-dependent.
        for file_name in sorted(os.listdir(self.data_path)):
            if not file_name.endswith(".txt"):
                continue
            if self.max_files is not None and len(data_list) >= self.max_files:
                break
            file_path = os.path.join(self.data_path, file_name)
            with open(file_path, "r", encoding="utf-8") as file_ref:
                data_list.append(
                    {
                        "file_name": file_name,
                        "page_content": file_ref.read(),
                    }
                )
        return data_list

    @staticmethod
    def _decode_unicode(text: str) -> str:
        """
        Convert escaped unicode sequences (e.g. ``\\u00e9``) to proper text.
        Non-string input is returned unchanged.
        """
        if not isinstance(text, str):
            return text
        try:
            # BUG FIX: encoding with UTF-8 before 'unicode-escape' mangles
            # any real non-ASCII character into mojibake. 'backslashreplace'
            # turns non-ASCII chars into escapes first, so both existing
            # characters and literal \uXXXX sequences round-trip correctly.
            return text.encode("ascii", "backslashreplace").decode("unicode-escape")
        except Exception:
            # Best effort: fall back to the original text on any failure.
            return text

    def _preprocess(self, data: list[dict]) -> list[dict]:
        """
        Apply preprocessing steps (e.g., unicode decoding) to raw data.
        """
        return [
            {
                "file_name": record["file_name"],
                "page_content": self._decode_unicode(record["page_content"]),
            }
            for record in data
        ]

    def load_documents(self) -> list[Document]:
        """
        Load and preprocess text files, converting them into
        LangChain Document objects (file name kept as ``source`` metadata).
        """
        cleaned_data = self._preprocess(self._load_files())
        return [
            Document(
                page_content=item["page_content"],
                metadata={"source": item["file_name"]},
            )
            for item in cleaned_data
        ]

    @staticmethod
    def chunk_documents(documents: list[Document],
                        chunk_size: int = 1000,
                        chunk_overlap: int = 200) -> list[Document]:
        """
        Split documents into smaller chunks for embedding and retrieval.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        return splitter.split_documents(documents)

    def build(self) -> tuple[list[Document], list[Document]]:
        """
        End-to-end pipeline:
        - Load documents
        - Chunk them
        Returns (chunks, original documents).
        """
        documents = self.load_documents()
        chunks = self.chunk_documents(documents)
        return chunks, documents
|
src/embedding.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
import numpy as np
|
| 3 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class EmbeddingManager:
    """
    Thin wrapper around a HuggingFace sentence-embedding model,
    loaded eagerly at construction time.
    """

    def __init__(self, model_name: str = "pritamdeka/S-BioBERT-snli-multinli-stsb"):
        self.model_name = model_name
        self.model = None
        self.load_model()

    def load_model(self):
        """Instantiate the HuggingFace embeddings object (downloads on first use)."""
        print("Loading embedding model:", self.model_name)
        self.model = HuggingFaceEmbeddings(model_name=self.model_name)
        print("Model loaded.")

    def get_model(self):
        """Return the underlying embeddings object (e.g. to hand to a vector store)."""
        return self.model

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        """
        Embed a batch of texts.

        Raises RuntimeError when no model is loaded.
        Note: embed_documents returns a list of float lists, not an ndarray —
        the previous annotation was wrong.
        """
        if self.model is None:
            raise RuntimeError("Model not loaded. Call load_model() first.")
        return self.model.embed_documents(texts)

    def embed_one(self, text: str) -> List[float]:
        """
        Embed a single query string.

        Raises RuntimeError when no model is loaded — guard added for
        consistency with embed_texts (previously this raised AttributeError).
        """
        if self.model is None:
            raise RuntimeError("Model not loaded. Call load_model() first.")
        return self.model.embed_query(text)
|
src/vectorstore.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import numpy as np
|
| 3 |
+
from typing import List
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from src.constant import BASE_DIR
|
| 6 |
+
import chromadb
|
| 7 |
+
from langchain.vectorstores import Chroma
|
| 8 |
+
from langchain.schema import Document
|
| 9 |
+
from uuid import uuid4
|
| 10 |
+
|
| 11 |
+
DATA_DIR = os.path.join(BASE_DIR, "db")


class VectorStore:
    """
    Wrapper around Chroma vector database for persistent storage
    and retrieval of document embeddings.
    """

    def __init__(self,
                 collection_name: str = "medrag",
                 persist_directory: str = DATA_DIR):
        """
        :param collection_name: Chroma collection to create or reuse.
        :param persist_directory: on-disk location for the Chroma database.
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize Chroma client and collection (created if absent)."""
        try:
            dir_path = Path(self.persist_directory)
            dir_path.mkdir(parents=True, exist_ok=True)

            self.client = chromadb.PersistentClient(self.persist_directory)
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "RAG collection for biomedical research"}
            )
            print(f"Store initialized successfully: {self.collection_name}")
        except Exception as e:
            # Log and re-raise: the store is unusable if this fails.
            print(f"Error initializing the store: {e}")
            raise

    def get_len(self) -> int:
        """Return number of documents in the collection."""
        return self.collection.count()

    def add_documents(self, documents: List[Document], embeddings: np.ndarray, batch_size: int = 5000):
        """
        Add documents and their embeddings to the vector store in batches.

        :param documents: LangChain Documents whose page_content is stored.
        :param embeddings: one embedding per document (ndarray or list).
        :param batch_size: max items per collection.add call.
        :raises ValueError: when documents and embeddings differ in length
            (zip would otherwise silently drop the excess).
        """
        if isinstance(embeddings, np.ndarray):
            embeddings = embeddings.tolist()  # Chroma expects plain lists

        if len(documents) != len(embeddings):
            raise ValueError(
                f"documents ({len(documents)}) and embeddings "
                f"({len(embeddings)}) must have the same length"
            )

        for start in range(0, len(documents), batch_size):
            batch_docs = documents[start:start + batch_size]
            batch_embeds = embeddings[start:start + batch_size]

            ids, metadatas, texts, embeds = [], [], [], []

            for offset, (doc, emb) in enumerate(zip(batch_docs, batch_embeds)):
                ids.append(f"doc_{uuid4().hex}")
                texts.append(doc.page_content)
                metadata = dict(doc.metadata) if getattr(doc, "metadata", None) else {}
                # BUG FIX: 'doc_index' must be the document's position in the
                # full input list; the bare enumerate index restarts at 0 for
                # every batch, producing duplicate indices across batches.
                metadata.update({
                    "doc_index": start + offset,
                    "content_length": len(doc.page_content),
                })
                metadatas.append(metadata)
                embeds.append(emb)

            self.collection.add(
                ids=ids,
                documents=texts,
                embeddings=embeds,
                metadatas=metadatas
            )

        print(f"Documents and embeddings added to collection: {self.collection_name}")

    def get_retriever(self, embedding_function, search_kwargs: dict = None):
        """
        Return a retriever interface for semantic search.

        :param embedding_function: embeddings object used to embed queries
            (must match the model used to embed the stored documents).
        :param search_kwargs: passed to as_retriever; defaults to {"k": 5}.
        """
        if search_kwargs is None:
            search_kwargs = {"k": 5}

        vectorstore = Chroma(
            client=self.client,
            collection_name=self.collection_name,
            embedding_function=embedding_function
        )
        return vectorstore.as_retriever(search_kwargs=search_kwargs)
|