kanhacoderx committed on
Commit
6e39c64
·
verified ·
1 Parent(s): 47ddaf6

Upload 19 files

Browse files
.gitignore ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =========================
2
+ # Python
3
+ # =========================
4
+ __pycache__/
5
+ *.py[cod]
6
+ *.pyo
7
+ *.pyd
8
+ *.so
9
+ *.egg
10
+ *.egg-info/
11
+ dist/
12
+ build/
13
+
14
+ # Virtual Environment
15
+ venv/
16
+ .venv/
17
+ env/
18
+
19
+ # Environment Variables
20
+ .env
21
+
22
+ # Jupyter
23
+ .ipynb_checkpoints/
24
+
25
+ # Logs
26
+ *.log
27
+
28
+ # FAISS / Vector DB
29
+ artifacts/
30
+ faiss_index/
31
+ *.faiss
32
+ *.pkl
33
+
34
+ # Model Cache
35
+ .cache/
36
+ huggingface/
37
+ transformers_cache/
38
+
39
+ # OS Files
40
+ .DS_Store
41
+ Thumbs.db
42
+
43
+ # =========================
44
+ # Node / React / Vite
45
+ # =========================
46
+ node_modules/
47
+ frontend/node_modules/
48
+
49
+ # Vite Build
50
+ frontend/dist/
51
+ dist/
52
+
53
+ # Vercel
54
+ .vercel/
55
+
56
+ # npm/yarn
57
+ npm-debug.log*
58
+ yarn-debug.log*
59
+ yarn-error.log*
60
+
61
+ # =========================
62
+ # IDE
63
+ # =========================
64
+ .vscode/
65
+ .idea/
66
+
67
+ # =========================
68
+ # Temporary Files
69
+ # =========================
70
+ temp/
71
+ tmp/
72
+ *.tmp
Src/embeddings/__pycache__/embedder.cpython-313.pyc ADDED
Binary file (1.49 kB). View file
 
Src/embeddings/embedder.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_huggingface import HuggingFaceEmbeddings
2
+
3
class Embedder:
    """Thin wrapper around a HuggingFace sentence-embedding model.

    The model is loaded eagerly in the constructor and exposed as
    ``embedding_model`` so other components can reuse the same instance.
    """

    def __init__(self, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2'):
        # Remember the requested checkpoint, then load it immediately.
        self.model_name = model_name
        self.embedding_model = self.load_model()

    def load_model(self):
        """Create and return the ``HuggingFaceEmbeddings`` for ``self.model_name``."""
        return HuggingFaceEmbeddings(model_name=self.model_name)

    def embed_documents(self, documents):
        """Convert documents into embeddings by delegating to the loaded model."""
        vectors = self.embedding_model.embed_documents(documents)
        return vectors

    def embed_query(self, query: str):
        """Embed a single query string by delegating to the loaded model."""
        vector = self.embedding_model.embed_query(query)
        return vector
Src/embeddings/test.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from Src.embeddings.embedder import Embedder
2
+
3
# Manual smoke check: embed one sentence and report the size of its vector.
embedder = Embedder()

sentences = ["Machine learning is amazing"]
vectors = embedder.embed_documents(sentences)
first_vector = vectors[0]

print(len(first_vector))  # vector dimension
Src/ingestion/__pycache__/data_loader.cpython-313.pyc ADDED
Binary file (1.69 kB). View file
 
Src/ingestion/data_loader.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders import PyPDFLoader
2
+ from langchain_community.document_loaders import PyPDFLoader
3
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
4
+ from dataclasses import dataclass
5
+ #Create DataIngestionConfig
6
+
7
+
8
class DataIngestion:
    """Load a PDF from disk and split it into overlapping text chunks.

    Args:
        file_path: Path of the PDF handed to ``PyPDFLoader``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the same splitter.
    """

    def __init__(self, file_path: str, chunk_size: int = 500, chunk_overlap: int = 50):
        self.file_path = file_path
        # Defaults reproduce the previously hard-coded splitter settings.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def load_documents(self):
        """Load the PDF at ``self.file_path`` and return the loader's documents."""
        loader = PyPDFLoader(self.file_path)
        return loader.load()

    def split_documents(self, documents):
        """Split ``documents`` into chunks using the configured size and overlap."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return text_splitter.split_documents(documents)

    def ingests(self):
        """Run the full pipeline: load the PDF, then split it into chunks."""
        docs = self.load_documents()
        return self.split_documents(docs)

    def ingest(self):
        """Alias of :meth:`ingests` under the more conventional name."""
        return self.ingests()
Src/llm/generator.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_groq import ChatGroq
2
+ from langchain_core.messages import HumanMessage
3
+ from dotenv import load_dotenv
4
+ import os
5
+
6
class Generator:
    """Answer questions with a Groq-hosted chat model using retrieved context."""

    def __init__(self, temperature: float = 0.14):
        """Create the chat model client.

        The API key is read from the ``GROQ_API_KEY`` environment variable.
        This class does not load a ``.env`` file itself, so the process
        environment must already contain the key when it is constructed.

        Args:
            temperature: Sampling temperature forwarded to ``ChatGroq``.

        Raises:
            ValueError: If ``GROQ_API_KEY`` is missing or empty.
        """
        groq_key = os.getenv("GROQ_API_KEY")
        if not groq_key:
            raise ValueError("GROQ_API_KEY environment variable is not set.")

        # SECURITY: the key must never be hard-coded in source. Any key that
        # was previously committed to the repository should be revoked.
        self.llm = ChatGroq(
            api_key=groq_key,
            model="llama-3.3-70b-versatile",
            temperature=temperature,
        )

    def build_prompt(self, query: str, context: str, chat_history: str):
        """Build the prompt from the rules, chat history, context and question.

        Args:
            query: The user's current question.
            context: Retrieved document text to ground the answer.
            chat_history: Previous turns as text; may be empty.

        Returns:
            The complete prompt string.
        """
        # The fallback instruction appears once (rule 3). A second, trailing
        # "not in the context" instruction used to contradict rule 3 for
        # questions that are answerable from the chat history alone.
        lines = [
            "You are intelligent Assistant",
            "Use the document context and conversation history only to answer the user's question.",
            "",
            "Rules:",
            "1. Prefer the document context for document-related questions.",
            "2. Use chat history for conversation-related questions like:",
            '   - "what was my last question?"',
            '   - "what did you answer before?"',
            "3. If the answer is not available in either the context or the chat history, say:",
            '   "I don\'t know based on the given context."',
            "",
            "Conversation History:",
            chat_history or "(none)",
            "",
            "context:",
            context,
            "",
            "Current question:",
            query,
        ]
        return "\n".join(lines) + "\n"

    def generate(self, query: str, context: str, chat_history: str = ""):
        """Generate an answer with the LLM and return the response text."""
        prompt = self.build_prompt(query, context, chat_history)
        response = self.llm.invoke(prompt)
        return response.content
Src/llm/test.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Src.llm.generator import Generator
2
+
3
# Manual smoke check: ask the generator one question against a fixed context.
gen = Generator()

context = 'Transformers use attention mechanism'
query = 'why do transformer use'

print(gen.generate(query, context))
Src/pipeline/__pycache__/rag_pipeline.cpython-313.pyc ADDED
Binary file (3.03 kB). View file
 
Src/pipeline/rag_pipeline.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Src.ingestion.data_loader import DataIngestion
2
+ from Src.embeddings.embedder import Embedder
3
+ from Src.vectorstore.faiss_store import FAISSSSTORE
4
+ from Src.retrieval.retriever import Retriever
5
+ from Src.llm.generator import Generator
6
+
7
class RAGPipeline:
    """Wire ingestion, embedding, vector storage, retrieval and generation together.

    Args:
        file_path: Path of the PDF that ``build_index`` ingests.
    """

    def __init__(self, file_path: str):
        self.file_path = file_path
        self.embedder = Embedder()
        self.generator = Generator()
        self.faiss_store = FAISSSSTORE(self.embedder.embedding_model)
        # List of {'question': ..., 'answer': ...} dicts, oldest first.
        self.chat_memory = []

    def build_index(self):
        """Ingest the PDF, build the vector store, and save it to disk."""
        ingestion = DataIngestion(self.file_path)
        chunks = ingestion.ingests()

        self.faiss_store.create_vector_store(chunks)
        self.faiss_store.save_vector_store()

        return 'Vector Store Created And Saved Succesfully'

    def load_index(self):
        """Load the saved vector store from disk."""
        self.faiss_store.load_vector_store()
        return 'Vector Store Loaded Succesfully'

    def get_chat_history(self, limit: int = 3):
        """Return the last ``limit`` conversation turns as text.

        Args:
            limit: Maximum number of most recent turns to include.

        Returns:
            The formatted turns, or an empty string when there is no history
            or ``limit`` is not positive.
        """
        if limit <= 0:
            # Guard needed because ``[-0:]`` would select the whole list.
            return ''

        # Slice rather than index: ``[-limit]`` picked one item and raised
        # IndexError whenever fewer than ``limit`` turns existed.
        recent_turns = self.chat_memory[-limit:]
        return ''.join(
            f"Turn {i}:\n"
            f"user: {item['question']}\n"
            f"Assistant: {item['answer']}\n\n"
            for i, item in enumerate(recent_turns, 1)
        )

    def ask(self, query: str, k: int = 3):
        """Run the full RAG flow: retrieve context, generate, remember the turn.

        Args:
            query: The user's question.
            k: Number of chunks to retrieve.

        Returns:
            The generated answer.

        Raises:
            ValueError: If no vector store has been built or loaded yet.
        """
        if self.faiss_store.vector_store is None:
            raise ValueError('Vector Store is Not loaded or Created yet')

        retriever = Retriever(self.faiss_store.vector_store)
        context = retriever.retrieve(query, k=k)

        # Supply earlier turns so history questions can be answered.
        history = self.get_chat_history()
        answer = self.generator.generate(query, context, history)

        self.chat_memory.append({'question': query, 'answer': answer})
        return answer
Src/retrieval/__pycache__/retriever.cpython-312.pyc ADDED
Binary file (1.57 kB). View file
 
Src/retrieval/__pycache__/retriever.cpython-313.pyc ADDED
Binary file (1.65 kB). View file
 
Src/retrieval/retriever.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
class Retriever:
    """Fetch relevant chunks from a vector store and format them as context."""

    def __init__(self, vector_store):
        self.vector_store = vector_store

    def get_relevant_documents(self, query: str, k: int = 5):
        """Return the top ``k`` documents from the store's similarity search."""
        return self.vector_store.similarity_search(query, k=k)

    def format_context(self, documents):
        """Join documents into one string, each under a ``[Chunk N]`` header."""
        return "".join(
            f"[Chunk {position}]\n{doc.page_content}\n\n"
            for position, doc in enumerate(documents, start=1)
        )

    def retrieve(self, query: str, k: int = 3):
        """Run the full retrieval pipeline: search, then format the results."""
        matches = self.get_relevant_documents(query, k)
        return self.format_context(matches)
Src/retrieval/test.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from Src.embeddings.embedder import Embedder
# faiss_store.py exports the store class under the name FAISSSSTORE;
# importing the non-existent ``FAISSStore`` raised ImportError.
from Src.vectorstore.faiss_store import FAISSSSTORE as FAISSStore
from Src.retrieval.retriever import Retriever

# Load the vector store previously saved to disk.
embedder = Embedder()
faiss_store = FAISSStore(embedder.embedding_model)
faiss_store.load_vector_store()

retriever = Retriever(faiss_store.vector_store)

query = 'What Is The Main Idea Of Document'
context = retriever.retrieve(query)

# Show the retrieved context so this manual check produces visible output.
print(context)
Src/vectorstore/__pycache__/faiss_store.cpython-313.pyc ADDED
Binary file (2.47 kB). View file
 
Src/vectorstore/faiss_store.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain_community.vectorstores import FAISS
3
+
4
class FAISSStore:
    """Create, persist, load and query a FAISS vector store.

    Args:
        embedding_model: Embedding object passed to FAISS when building or
            loading the index.
    """

    def __init__(self, embedding_model):
        self.embedding_model = embedding_model
        # Stays None until create_vector_store or load_vector_store succeeds.
        self.vector_store = None

    def create_vector_store(self, chunks):
        """Create the FAISS vector store from document chunks and return it."""
        self.vector_store = FAISS.from_documents(
            documents=chunks,
            embedding=self.embedding_model,
        )
        return self.vector_store

    def save_vector_store(self, folder_path: str = 'artifacts/faiss_index'):
        """Save the FAISS index locally under ``folder_path``.

        Raises:
            ValueError: If no vector store has been created yet.
        """
        if self.vector_store is None:
            raise ValueError('Vector Has Not Been Created yet')
        os.makedirs(folder_path, exist_ok=True)
        self.vector_store.save_local(folder_path)

    def load_vector_store(self, folder_path: str = 'artifacts/faiss_index'):
        """Load the FAISS index from ``folder_path`` and return it."""
        # SECURITY: allow_dangerous_deserialization=True lets the loader
        # unpickle files from folder_path. Only load indexes written by this
        # application; never point this at untrusted data.
        self.vector_store = FAISS.load_local(
            folder_path=folder_path,
            embeddings=self.embedding_model,
            allow_dangerous_deserialization=True,
        )
        return self.vector_store

    def similarity_search(self, query: str, k: int = 3):
        """Return the ``k`` chunks most similar to ``query``.

        Raises:
            ValueError: If the vector store is neither created nor loaded.
        """
        if self.vector_store is None:
            raise ValueError('Vector Store is Not loaded or Created yet')
        return self.vector_store.similarity_search(query, k=k)


# Backward-compatible alias: existing imports use the original misspelled name.
FAISSSSTORE = FAISSStore
Src/vectorstore/test.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from Src.ingestion.data_loader import DataIngestion
from Src.embeddings.embedder import Embedder
# faiss_store.py exports the store class under the name FAISSSSTORE;
# importing the non-existent ``FAISSStore`` raised ImportError.
from Src.vectorstore.faiss_store import FAISSSSTORE as FAISSStore

# Step 1: Load and chunk documents
# NOTE(review): the path is an empty placeholder; set it to a real PDF first.
ingestion = DataIngestion("")
# DataIngestion defines ``ingests``; calling ``ingest`` raised AttributeError.
chunks = ingestion.ingests()

# Step 2: Load embedding model
embedder = Embedder()

# Step 3: Create vector store
faiss_store = FAISSStore(embedder.embedding_model)
faiss_store.create_vector_store(chunks)

# Step 4: Search
results = faiss_store.similarity_search("What is the main topic of the document?", k=2)

for i, doc in enumerate(results, 1):
    print(f"\nResult {i}:")
    print(doc.page_content[:500])
    print("-" * 50)
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import tempfile
4
+
5
+ from flask import Flask, request, jsonify, send_from_directory
6
+ from flask_cors import CORS
7
+
8
+ from Src.pipeline.rag_pipeline import RAGPipeline
9
+ from dotenv import load_dotenv
10
+
11
+
12
# Read variables from a local .env file into the process environment.
load_dotenv()

# Flask serves the built frontend from frontend/dist/client at the site root.
app = Flask(__name__, static_folder='frontend/dist/client', static_url_path="")

# Allow cross-origin requests from any origin, but only on /api/* routes.
CORS(app, resources={r"/api/*": {"origins": "*"}})

# In-memory registry: session id -> {"pipeline": ..., "filename": ...}.
sessions: dict = {}
22
+
23
+
24
@app.route("/api/upload", methods=["POST"])
def upload_pdf():
    """Accept a PDF upload, index it, and register a new chat session.

    Expects a multipart form with the PDF in the ``file`` field.

    Returns:
        A JSON response with ``message`` and ``session_id`` (200), or an
        ``error`` payload with status 400 (bad input) or 500 (processing
        failure).
    """
    # 1. Validate file is in the request
    if "file" not in request.files:
        return jsonify({"error": "No file provided. Field name must be 'file'."}), 400

    upload = request.files["file"]

    # The filename can be missing, so normalise it before calling str methods.
    filename = upload.filename or ""
    if not filename.lower().endswith(".pdf"):
        return jsonify({"error": "Only PDF files are supported."}), 400

    # Reserve a temp path and close the handle straight away: only the name is
    # needed, and an open handle would leak if saving failed (and blocks the
    # write on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp_path = tmp.name

    try:
        upload.save(tmp_path)

        pipeline = RAGPipeline(tmp_path)
        result = pipeline.build_index()
        print(f"[Upload] {result} | file: {filename}")

    except Exception as e:
        print(f"[Upload ERROR] {e}")
        return jsonify({"error": f"Failed to process PDF: {str(e)}"}), 500

    finally:
        # The PDF is only needed while the index is being built.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)

    session_id = str(uuid.uuid4())
    sessions[session_id] = {
        "pipeline": pipeline,
        "filename": filename
    }

    print(f"[Upload] Session created → {session_id}")

    return jsonify({
        "message": f"'{filename}' processed successfully.",
        "session_id": session_id
    }), 200
66
+
67
+
68
+
69
@app.route("/api/chat", methods=["POST"])
def chat():
    """Answer a question against the document indexed for a session.

    Expects a JSON object with string fields ``question`` and ``session_id``.

    Returns:
        A JSON response with ``answer`` (200), or an ``error`` payload with
        status 400 (bad input), 404 (unknown session) or 500 (pipeline
        failure).
    """
    # silent=True returns None for a missing or invalid JSON body, so the
    # JSON 400 below is what the client receives.
    data = request.get_json(silent=True)

    if not data or not isinstance(data, dict):
        return jsonify({"error": "Request body must be JSON."}), 400

    raw_question = data.get("question", "")
    raw_session_id = data.get("session_id", "")

    # Non-string values are treated as missing rather than crashing on .strip().
    question = raw_question.strip() if isinstance(raw_question, str) else ""
    session_id = raw_session_id.strip() if isinstance(raw_session_id, str) else ""

    if not question:
        return jsonify({"error": "Question is required."}), 400

    if not session_id:
        return jsonify({"error": "Session ID is required."}), 400

    session = sessions.get(session_id)

    if not session:
        return jsonify({
            "error": "Session not found. Please upload PDF again."
        }), 404

    try:
        pipeline = session["pipeline"]
        answer = pipeline.ask(question)

        print(f"[Chat] Q: {question}")
        print(f"[Chat] A: {answer}")

        return jsonify({
            "answer": answer
        }), 200

    except Exception as e:
        # Imported locally because the module does not import traceback.
        import traceback

        print("\n========== CHAT ERROR ==========")
        traceback.print_exc()
        print("================================\n")

        return jsonify({
            "error": str(e)
        }), 500
118
+
119
@app.route("/", defaults={"path": ""})
@app.route("/<path:path>")
def serve_react(path):
    """Serve a built frontend file, falling back to ``index.html``.

    A non-empty ``path`` that exists under the static folder is served as-is;
    every other request receives ``index.html``.
    """
    static_root = app.static_folder
    requested = os.path.join(static_root, path)
    is_existing_asset = bool(path) and os.path.exists(requested)
    target = path if is_existing_asset else "index.html"
    return send_from_directory(static_root, target)
126
+
127
+
128
if __name__ == "__main__":
    # Resolve the port first so the banner shows the address actually served;
    # it used to advertise port 5000 while the server bound to PORT (7860).
    port = int(os.environ.get("PORT", 7860))
    base_url = f"http://localhost:{port}"

    print("\n DocuMind AI — Server Starting")
    print("=" * 45)
    print(f" Login → {base_url}/login.html")
    print(f" Register → {base_url}/register.html")
    print(f" App → {base_url}")
    print(" Upload → POST /api/upload")
    print(" Chat → POST /api/chat")
    print("=" * 45)

    # Ensure the directory for the saved FAISS index exists before serving.
    os.makedirs("artifacts/faiss_index", exist_ok=True)

    app.run(host="0.0.0.0", port=port)
dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+
7
+ RUN pip install --no-cache-dir -r requirements.txt
8
+
9
+ COPY . .
10
+
11
+ EXPOSE 7860
12
+
13
+ CMD ["python", "app.py"]