Spaces:

kanhacoderx
/

Documind-AI

Sleeping

App Files Files Community

kanhacoderx committed on 3 days ago

Commit

6e39c64

verified ·

1 Parent(s): 47ddaf6

Upload 19 files

Browse files

Files changed (19) hide show

.gitignore +72 -0
Src/embeddings/__pycache__/embedder.cpython-313.pyc +0 -0
Src/embeddings/embedder.py +26 -0
Src/embeddings/test.py +8 -0
Src/ingestion/__pycache__/data_loader.cpython-313.pyc +0 -0
Src/ingestion/data_loader.py +36 -0
Src/llm/generator.py +54 -0
Src/llm/test.py +11 -0
Src/pipeline/__pycache__/rag_pipeline.cpython-313.pyc +0 -0
Src/pipeline/rag_pipeline.py +54 -0
Src/retrieval/__pycache__/retriever.cpython-312.pyc +0 -0
Src/retrieval/__pycache__/retriever.cpython-313.pyc +0 -0
Src/retrieval/retriever.py +27 -0
Src/retrieval/test.py +13 -0
Src/vectorstore/__pycache__/faiss_store.cpython-313.pyc +0 -0
Src/vectorstore/faiss_store.py +50 -0
Src/vectorstore/test.py +22 -0
app.py +141 -0
dockerfile +13 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,72 @@

+# =========================
+# Python
+# =========================
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+*.so
+*.egg
+*.egg-info/
+dist/
+build/
+# Virtual Environment
+venv/
+.venv/
+env/
+# Environment Variables
+.env
+# Jupyter
+.ipynb_checkpoints/
+# Logs
+*.log
+# FAISS / Vector DB
+artifacts/
+faiss_index/
+*.faiss
+*.pkl
+# Model Cache
+.cache/
+huggingface/
+transformers_cache/
+# OS Files
+.DS_Store
+Thumbs.db
+# =========================
+# Node / React / Vite
+# =========================
+node_modules/
+frontend/node_modules/
+# Vite Build
+frontend/dist/
+dist/
+# Vercel
+.vercel/
+# npm/yarn
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+# =========================
+# IDE
+# =========================
+.vscode/
+.idea/
+# =========================
+# Temporary Files
+# =========================
+temp/
+tmp/
+*.tmp

Src/embeddings/__pycache__/embedder.cpython-313.pyc ADDED Viewed

Binary file (1.49 kB). View file

Src/embeddings/embedder.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from langchain_huggingface import HuggingFaceEmbeddings
class Embedder:
    """Thin wrapper around a HuggingFace sentence-embedding model."""

    def __init__(self, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2'):
        # The model is loaded eagerly so the instance is usable right away.
        self.model_name = model_name
        self.embedding_model = self.load_model()

    def load_model(self):
        """Instantiate the embedding model named by ``self.model_name``."""
        return HuggingFaceEmbeddings(model_name=self.model_name)

    def embed_documents(self, documents):
        """Convert a list of document texts into embedding vectors."""
        backend = self.embedding_model
        return backend.embed_documents(documents)

    def embed_query(self, query: str):
        """Convert a single query string into an embedding vector."""
        backend = self.embedding_model
        return backend.embed_query(query)

Src/embeddings/test.py ADDED Viewed

	@@ -0,0 +1,8 @@

from Src.embeddings.embedder import Embedder

# Smoke check: embed one sentence and report the size of the resulting vector.
sample_texts = ["Machine learning is amazing"]
vectors = Embedder().embed_documents(sample_texts)
embedding_dim = len(vectors[0])
print(embedding_dim)  # vector dimension

Src/ingestion/__pycache__/data_loader.cpython-313.pyc ADDED Viewed

Binary file (1.69 kB). View file

Src/ingestion/data_loader.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from dataclasses import dataclass
+#Create DataIngestionConfig
class DataIngestion:
    """Load a PDF and split it into overlapping text chunks.

    Args:
        file_path: Path of the PDF to ingest.
        chunk_size: Maximum size of each chunk passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
    """

    def __init__(self, file_path: str, chunk_size: int = 500, chunk_overlap: int = 50):
        self.file_path = file_path
        # Previously hard-coded in split_documents; the defaults keep the
        # original behaviour.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def load_documents(self):
        """Load the PDF at ``self.file_path`` and return its document objects."""
        loader = PyPDFLoader(self.file_path)
        return loader.load()

    def split_documents(self, documents):
        """Split the loaded documents into chunks."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return text_splitter.split_documents(documents)

    def ingests(self):
        """Run the full pipeline: load the PDF, then split it into chunks."""
        docs = self.load_documents()
        return self.split_documents(docs)

    # Some callers use the singular spelling; keep both names working.
    ingest = ingests

Src/llm/generator.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from langchain_groq import ChatGroq
+from langchain_core.messages import HumanMessage
+from dotenv import load_dotenv
+import os
class Generator:
    """Generate answers with a Groq-hosted chat model from retrieved context.

    Args:
        temperature: Sampling temperature passed to the chat model.
        api_key: Optional Groq API key. When omitted, the ``GROQ_API_KEY``
            environment variable is used.

    Raises:
        ValueError: If no API key is supplied and ``GROQ_API_KEY`` is unset.
    """

    def __init__(self, temperature: float = 0.14, api_key=None):
        # SECURITY: the key must never be embedded in source. The key that was
        # previously committed here is public and must be revoked/rotated.
        groq_key = api_key or os.getenv("GROQ_API_KEY")
        print("GROQ KEY FOUND:", bool(groq_key))
        if not groq_key:
            raise ValueError(
                "Groq API key is missing. Set the GROQ_API_KEY environment "
                "variable or pass api_key explicitly."
            )
        self.llm = ChatGroq(
            api_key=groq_key,
            model="llama-3.3-70b-versatile",
            temperature=temperature,
        )

    def build_prompt(self, query: str, context: str, chat_history: str):
        """Build the prompt from conversation history, context and question.

        The former trailing "If the answer is not in the context" rule was
        dropped: it contradicted rule 2 (answers may come from chat history)
        and duplicated rule 3.
        """
        prompt = f'''
      You are intelligent Assistant
Use the document context and conversation history only to answer the user's question.
Rules:
1. Prefer the document context for document-related questions.
2. Use chat history for conversation-related questions like:
   - "what was my last question?"
   - "what did you answer before?"
3. If the answer is not available in either the context or the chat history, say:
   "I don't know based on the given context."
      Conversation History:
      {chat_history}
      context:
      {context}
      Current question:
      {query}
'''
        return prompt

    def generate(self, query: str, context: str, chat_history: str = ""):
        """Generate an answer for ``query`` and return the model's text."""
        prompt = self.build_prompt(query, context, chat_history)
        response = self.llm.invoke(prompt)
        return response.content

Src/llm/test.py ADDED Viewed

	@@ -0,0 +1,11 @@

from Src.llm.generator import Generator

# Smoke check: ask one question against a fixed one-line context.
sample_context = 'Transformers use attention mechanism'
sample_query = 'why do transformer use'
generator = Generator()
print(generator.generate(sample_query, sample_context))

Src/pipeline/__pycache__/rag_pipeline.cpython-313.pyc ADDED Viewed

Binary file (3.03 kB). View file

Src/pipeline/rag_pipeline.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from Src.ingestion.data_loader import DataIngestion
+from Src.embeddings.embedder import Embedder
+from Src.vectorstore.faiss_store import FAISSSSTORE
+from Src.retrieval.retriever import Retriever
+from Src.llm.generator import Generator
class RAGPipeline:
    """End-to-end RAG flow over one PDF: ingest, index, retrieve and answer.

    Args:
        file_path: Path of the PDF that ``build_index`` ingests.
    """

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.embedder = Embedder()
        self.generator = Generator()
        self.faiss_store = FAISSSSTORE(self.embedder.embedding_model)
        # List of {"question": ..., "answer": ...} dicts, oldest first.
        self.chat_memory = []

    def build_index(self):
        """Ingest the PDF, build the vector store and save it to disk."""
        ingestion = DataIngestion(self.file_path)
        chunks = ingestion.ingests()
        self.faiss_store.create_vector_store(chunks)
        self.faiss_store.save_vector_store()
        return 'Vector Store Created And Saved Succesfully'

    def load_index(self):
        """Load the saved vector store from disk."""
        self.faiss_store.load_vector_store()
        return 'Vector Store Loaded Succesfully'

    def get_chat_history(self, limit: int = 3):
        """Return the last ``limit`` conversation turns as formatted text.

        Returns an empty string when there is no history or ``limit`` is not
        positive.
        """
        # A non-positive limit must not fall through: [-0:] is the whole list.
        if limit <= 0:
            return ''
        # Slice rather than index: [-limit] picked a single turn and raised
        # IndexError on an empty memory.
        recent_turns = self.chat_memory[-limit:]
        return ''.join(
            f"Turn {i}:\n"
            f"user: {item['question']}\n"
            f"Assistant: {item['answer']}\n\n"
            for i, item in enumerate(recent_turns, 1)
        )

    def ask(self, query: str, k: int = 3):
        """Full RAG flow: retrieve context, generate an answer, record the turn.

        Args:
            query: The user's question.
            k: Number of chunks to retrieve.

        Returns:
            The generated answer text.
        """
        retriever = Retriever(self.faiss_store.vector_store)
        context = retriever.retrieve(query, k=k)
        # Give the generator the recent turns so follow-up questions work.
        history = self.get_chat_history()
        answer = self.generator.generate(query, context, chat_history=history)
        self.chat_memory.append({'question': query, 'answer': answer})
        return answer

Src/retrieval/__pycache__/retriever.cpython-312.pyc ADDED Viewed

Binary file (1.57 kB). View file

Src/retrieval/__pycache__/retriever.cpython-313.pyc ADDED Viewed

Binary file (1.65 kB). View file

Src/retrieval/retriever.py ADDED Viewed

	@@ -0,0 +1,27 @@

class Retriever:
    """Retrieve the chunks most similar to a query and format them as context.

    Args:
        vector_store: Object exposing ``similarity_search(query, k=...)``.
    """

    def __init__(self, vector_store):
        self.vector_store = vector_store

    def get_relevant_documents(self, query: str, k: int = 5):
        """Return the top ``k`` documents most similar to ``query``.

        Raises:
            ValueError: If no vector store has been created or loaded.
        """
        # Fail with a clear message instead of an AttributeError on None.
        if self.vector_store is None:
            raise ValueError('Vector Store is Not loaded or Created yet')
        return self.vector_store.similarity_search(query, k=k)

    def format_context(self, documents):
        """Join the documents into one context string, one labelled chunk each."""
        return "".join(
            f"[Chunk {i}]\n{doc.page_content}\n\n"
            for i, doc in enumerate(documents, 1)
        )

    def retrieve(self, query: str, k: int = 3):
        """Run the full retrieval pipeline and return the formatted context."""
        docs = self.get_relevant_documents(query, k)
        return self.format_context(docs)

Src/retrieval/test.py ADDED Viewed

	@@ -0,0 +1,13 @@

from Src.embeddings.embedder import Embedder
# The class is defined as FAISSSSTORE; importing FAISSStore raised ImportError.
from Src.vectorstore.faiss_store import FAISSSSTORE
from Src.retrieval.retriever import Retriever

# Load the vector store previously saved to disk.
embedder = Embedder()
faiss_store = FAISSSSTORE(embedder.embedding_model)
faiss_store.load_vector_store()

retriever = Retriever(faiss_store.vector_store)
query = 'What Is The Main Idea Of Document'
context = retriever.retrieve(query)
# Show the retrieved context; otherwise the script gives no visible result.
print(context)

Src/vectorstore/__pycache__/faiss_store.cpython-313.pyc ADDED Viewed

Binary file (2.47 kB). View file

Src/vectorstore/faiss_store.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import os
+from langchain_community.vectorstores import FAISS
class FAISSSSTORE:
    """Create, persist, load and query a FAISS vector store.

    Args:
        embedding_model: Embedding model used to build and load the index.
    """

    def __init__(self, embedding_model):
        self.embedding_model = embedding_model
        self.vector_store = None

    def create_vector_store(self, chunks):
        """Create the FAISS vector store from document chunks.

        Raises:
            ValueError: If ``chunks`` is empty (for example, a PDF with no
                extractable text).
        """
        # Fail early with a clear message; FAISS gives an obscure error on
        # an empty document list.
        if not chunks:
            raise ValueError('No document chunks to index')
        self.vector_store = FAISS.from_documents(
            documents=chunks,
            embedding=self.embedding_model,
        )
        return self.vector_store

    def save_vector_store(self, folder_path: str = 'artifacts/faiss_index'):
        """Save the FAISS index to ``folder_path``.

        Raises:
            ValueError: If no vector store has been created yet.
        """
        if self.vector_store is None:
            raise ValueError('Vector Has Not Been Created yet')
        os.makedirs(folder_path, exist_ok=True)
        self.vector_store.save_local(folder_path)

    def load_vector_store(self, folder_path: str = 'artifacts/faiss_index'):
        """Load the FAISS index from ``folder_path``."""
        # NOTE(security): allow_dangerous_deserialization=True loads pickled
        # data. Only point this at index folders this application wrote.
        self.vector_store = FAISS.load_local(
            folder_path=folder_path,
            embeddings=self.embedding_model,
            allow_dangerous_deserialization=True,
        )
        return self.vector_store

    def similarity_search(self, query: str, k: int = 3):
        """Return the ``k`` chunks most similar to ``query``.

        Raises:
            ValueError: If no vector store has been created or loaded.
        """
        if self.vector_store is None:
            raise ValueError('Vector Store is Not loaded or Created yet')
        return self.vector_store.similarity_search(query, k=k)


# Alias for callers that import the conventional spelling.
FAISSStore = FAISSSSTORE

Src/vectorstore/test.py ADDED Viewed

	@@ -0,0 +1,22 @@

import sys

from Src.ingestion.data_loader import DataIngestion
from Src.embeddings.embedder import Embedder
# The class is defined as FAISSSSTORE; importing FAISSStore raised ImportError.
from Src.vectorstore.faiss_store import FAISSSSTORE

# The PDF path comes from the command line; an empty path cannot be loaded.
if len(sys.argv) < 2:
    sys.exit("usage: python -m Src.vectorstore.test <path-to-pdf>")
pdf_path = sys.argv[1]

# Step 1: Load and chunk documents (the method is named `ingests`).
ingestion = DataIngestion(pdf_path)
chunks = ingestion.ingests()

# Step 2: Load embedding model
embedder = Embedder()

# Step 3: Create vector store
faiss_store = FAISSSSTORE(embedder.embedding_model)
faiss_store.create_vector_store(chunks)

# Step 4: Search
results = faiss_store.similarity_search("What is the main topic of the document?", k=2)
for i, doc in enumerate(results, 1):
    print(f"\nResult {i}:")
    print(doc.page_content[:500])
    print("-" * 50)

app.py ADDED Viewed

	@@ -0,0 +1,141 @@

+import os
+import uuid
+import tempfile
+from flask import Flask, request, jsonify, send_from_directory
+from flask_cors import CORS
+from Src.pipeline.rag_pipeline import RAGPipeline
+from dotenv import load_dotenv
# Read variables from a local .env file into the process environment.
load_dotenv()

# The Flask app also serves the built frontend bundle from the site root.
app = Flask(__name__, static_folder='frontend/dist/client', static_url_path="")

# Allow cross-origin requests to the API endpoints only.
CORS(app, resources={r"/api/*": {"origins": "*"}})

# In-memory session registry, keyed by session id.
sessions: dict = {}
@app.route("/api/upload", methods=["POST"])
def upload_pdf():
    """Accept a PDF upload, build its index and open a chat session.

    Returns:
        JSON with ``message`` and ``session_id`` on success (200); JSON with
        ``error`` on a bad request (400) or processing failure (500).
    """
    # 1. Validate file is in the request
    if "file" not in request.files:
        return jsonify({"error": "No file provided. Field name must be 'file'."}), 400
    file = request.files["file"]
    # The filename may be missing; treat that the same as a non-PDF upload.
    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        return jsonify({"error": "Only PDF files are supported."}), 400

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    # Only the path is needed. Closing right away avoids leaking the handle
    # when saving fails.
    tmp.close()
    try:
        file.save(tmp.name)
        pipeline = RAGPipeline(tmp.name)
        result = pipeline.build_index()
        print(f"[Upload] {result} | file: {filename}")
    except Exception as e:
        print(f"[Upload ERROR] {e}")
        return jsonify({"error": f"Failed to process PDF: {str(e)}"}), 500
    finally:
        # The index lives in memory after build_index; the temp PDF is no
        # longer needed.
        if os.path.exists(tmp.name):
            os.unlink(tmp.name)

    session_id = str(uuid.uuid4())
    sessions[session_id] = {
        "pipeline": pipeline,
        "filename": filename
    }
    print(f"[Upload] Session created → {session_id}")
    return jsonify({
        "message": f"'{filename}' processed successfully.",
        "session_id": session_id
    }), 200
@app.route("/api/chat", methods=["POST"])
def chat():
    """Answer a question against the PDF bound to the given session.

    Expects a JSON object with ``question`` and ``session_id``.

    Returns:
        JSON with ``answer`` (200); JSON with ``error`` for a bad request
        (400), an unknown session (404) or a generation failure (500).
    """
    # silent=True returns None for a missing or invalid JSON body, so the
    # JSON error below is sent instead of Flask's default error page.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or not data:
        return jsonify({"error": "Request body must be JSON."}), 400

    # Coerce to str so null or numeric values cannot crash on .strip().
    question = str(data.get("question") or "").strip()
    session_id = str(data.get("session_id") or "").strip()
    if not question:
        return jsonify({"error": "Question is required."}), 400
    if not session_id:
        return jsonify({"error": "Session ID is required."}), 400

    session = sessions.get(session_id)
    if not session:
        return jsonify({
            "error": "Session not found. Please upload PDF again."
        }), 404

    try:
        pipeline = session["pipeline"]
        answer = pipeline.ask(question)
        print(f"[Chat] Q: {question}")
        print(f"[Chat] A: {answer}")
        return jsonify({
            "answer": answer
        }), 200
    except Exception as e:
        import traceback
        print("\n========== CHAT ERROR ==========")
        traceback.print_exc()
        print("================================\n")
        return jsonify({
            "error": str(e)
        }), 500
@app.route("/", defaults={"path": ""})
@app.route("/<path:path>")
def serve_react(path):
    """Serve a file from the static folder, falling back to index.html."""
    static_root = app.static_folder
    requested = os.path.join(static_root, path)
    # An empty path or an unknown file both get the frontend entry point.
    is_existing_file = bool(path) and os.path.exists(requested)
    target = path if is_existing_file else "index.html"
    return send_from_directory(static_root, target)
if __name__ == "__main__":
    # Resolve the port first so the banner shows the address actually served.
    # It used to print port 5000 while binding to PORT (default 7860).
    port = int(os.environ.get("PORT", 7860))
    print("\n DocuMind AI — Server Starting")
    print("=" * 45)
    print(f"  Login     →  http://localhost:{port}/login.html")
    print(f"  Register  →  http://localhost:{port}/register.html")
    print(f"  App       →  http://localhost:{port}")
    print("  Upload    →  POST /api/upload")
    print("  Chat      →  POST /api/chat")
    print("=" * 45)
    # Make sure the index folder exists before any request is served.
    os.makedirs("artifacts/faiss_index", exist_ok=True)
    app.run(host="0.0.0.0", port=port)

dockerfile ADDED Viewed

	@@ -0,0 +1,13 @@

FROM python:3.11
# The app logs with print(); unbuffered output makes those lines appear in the
# container logs immediately instead of sitting in a buffer.
ENV PYTHONUNBUFFERED=1
WORKDIR /app
# Install dependencies before copying the source so this layer stays cached
# when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# Matches the default PORT that app.py binds to.
EXPOSE 7860
CMD ["python", "app.py"]