Spaces:
Sleeping
Sleeping
Upload 19 files
Browse files- .gitignore +72 -0
- Src/embeddings/__pycache__/embedder.cpython-313.pyc +0 -0
- Src/embeddings/embedder.py +26 -0
- Src/embeddings/test.py +8 -0
- Src/ingestion/__pycache__/data_loader.cpython-313.pyc +0 -0
- Src/ingestion/data_loader.py +36 -0
- Src/llm/generator.py +54 -0
- Src/llm/test.py +11 -0
- Src/pipeline/__pycache__/rag_pipeline.cpython-313.pyc +0 -0
- Src/pipeline/rag_pipeline.py +54 -0
- Src/retrieval/__pycache__/retriever.cpython-312.pyc +0 -0
- Src/retrieval/__pycache__/retriever.cpython-313.pyc +0 -0
- Src/retrieval/retriever.py +27 -0
- Src/retrieval/test.py +13 -0
- Src/vectorstore/__pycache__/faiss_store.cpython-313.pyc +0 -0
- Src/vectorstore/faiss_store.py +50 -0
- Src/vectorstore/test.py +22 -0
- app.py +141 -0
- dockerfile +13 -0
.gitignore
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =========================
|
| 2 |
+
# Python
|
| 3 |
+
# =========================
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.py[cod]
|
| 6 |
+
*.pyo
|
| 7 |
+
*.pyd
|
| 8 |
+
*.so
|
| 9 |
+
*.egg
|
| 10 |
+
*.egg-info/
|
| 11 |
+
dist/
|
| 12 |
+
build/
|
| 13 |
+
|
| 14 |
+
# Virtual Environment
|
| 15 |
+
venv/
|
| 16 |
+
.venv/
|
| 17 |
+
env/
|
| 18 |
+
|
| 19 |
+
# Environment Variables
|
| 20 |
+
.env
|
| 21 |
+
|
| 22 |
+
# Jupyter
|
| 23 |
+
.ipynb_checkpoints/
|
| 24 |
+
|
| 25 |
+
# Logs
|
| 26 |
+
*.log
|
| 27 |
+
|
| 28 |
+
# FAISS / Vector DB
|
| 29 |
+
artifacts/
|
| 30 |
+
faiss_index/
|
| 31 |
+
*.faiss
|
| 32 |
+
*.pkl
|
| 33 |
+
|
| 34 |
+
# Model Cache
|
| 35 |
+
.cache/
|
| 36 |
+
huggingface/
|
| 37 |
+
transformers_cache/
|
| 38 |
+
|
| 39 |
+
# OS Files
|
| 40 |
+
.DS_Store
|
| 41 |
+
Thumbs.db
|
| 42 |
+
|
| 43 |
+
# =========================
|
| 44 |
+
# Node / React / Vite
|
| 45 |
+
# =========================
|
| 46 |
+
node_modules/
|
| 47 |
+
frontend/node_modules/
|
| 48 |
+
|
| 49 |
+
# Vite Build
|
| 50 |
+
frontend/dist/
|
| 51 |
+
dist/
|
| 52 |
+
|
| 53 |
+
# Vercel
|
| 54 |
+
.vercel/
|
| 55 |
+
|
| 56 |
+
# npm/yarn
|
| 57 |
+
npm-debug.log*
|
| 58 |
+
yarn-debug.log*
|
| 59 |
+
yarn-error.log*
|
| 60 |
+
|
| 61 |
+
# =========================
|
| 62 |
+
# IDE
|
| 63 |
+
# =========================
|
| 64 |
+
.vscode/
|
| 65 |
+
.idea/
|
| 66 |
+
|
| 67 |
+
# =========================
|
| 68 |
+
# Temporary Files
|
| 69 |
+
# =========================
|
| 70 |
+
temp/
|
| 71 |
+
tmp/
|
| 72 |
+
*.tmp
|
Src/embeddings/__pycache__/embedder.cpython-313.pyc
ADDED
|
Binary file (1.49 kB). View file
|
|
|
Src/embeddings/embedder.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 2 |
+
|
| 3 |
+
class Embedder:
    """Thin wrapper around a HuggingFace embedding model.

    The model is loaded once in the constructor and kept on
    ``self.embedding_model`` so other components can reuse it.
    """

    def __init__(self,model_name:str='sentence-transformers/all-MiniLM-L6-v2'):
        self.model_name=model_name
        self.embedding_model=self.load_model()

    def load_model(self):
        """Build the ``HuggingFaceEmbeddings`` instance for ``self.model_name``."""
        return HuggingFaceEmbeddings(model_name=self.model_name)

    def embed_documents(self,documents):
        """Convert documents into embeddings using the loaded model."""
        vectors=self.embedding_model.embed_documents(documents)
        return vectors

    def embed_query(self,query:str):
        """Embed a single query string using the loaded model."""
        query_vector=self.embedding_model.embed_query(query)
        return query_vector
|
Src/embeddings/test.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from Src.embeddings.embedder import Embedder
|
| 2 |
+
|
| 3 |
+
# Smoke test: embed one sentence and print the size of the resulting vector.
embedder = Embedder()

sentences = ["Machine learning is amazing"]
first_vector = embedder.embed_documents(sentences)[0]

print(len(first_vector)) # vector dimension
|
Src/ingestion/__pycache__/data_loader.cpython-313.pyc
ADDED
|
Binary file (1.69 kB). View file
|
|
|
Src/ingestion/data_loader.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_community.document_loaders import PyPDFLoader
|
| 2 |
+
from langchain_community.document_loaders import PyPDFLoader
|
| 3 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
#Create DataIngestionConfig
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class DataIngestion:
    """Load a PDF and split it into overlapping chunks.

    Args:
        file_path: Path of the PDF handed to ``PyPDFLoader``.
        chunk_size: ``chunk_size`` passed to the text splitter (default 500,
            the value that used to be hard-coded).
        chunk_overlap: ``chunk_overlap`` passed to the text splitter
            (default 50, the value that used to be hard-coded).
    """

    def __init__(self, file_path: str, chunk_size: int = 500, chunk_overlap: int = 50):
        self.file_path = file_path
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def load_documents(self):
        """Load the PDF at ``self.file_path`` and return its document objects."""
        loader = PyPDFLoader(self.file_path)
        return loader.load()

    def split_documents(self, documents):
        """Split ``documents`` into chunks using the configured size and overlap."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        return text_splitter.split_documents(documents)

    def ingests(self):
        """Run the full ingestion pipeline: load the PDF, then chunk it."""
        docs = self.load_documents()
        return self.split_documents(docs)
|
Src/llm/generator.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain_groq import ChatGroq
|
| 2 |
+
from langchain_core.messages import HumanMessage
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
class Generator:
    """Answer questions with a Groq-hosted chat model from retrieved context.

    Args:
        temperature: Sampling temperature forwarded to ``ChatGroq``.

    Raises:
        ValueError: If ``GROQ_API_KEY`` is not set in the environment.
    """

    def __init__(self,temperature:float=0.14):
        # Load a local .env file (if any) so scripts that never call
        # load_dotenv() themselves still see GROQ_API_KEY.
        load_dotenv()
        groq_key = os.getenv("GROQ_API_KEY")
        print("GROQ KEY FOUND:", bool(groq_key))
        # SECURITY: a literal API key used to be passed here. Secrets must
        # never live in source control; the previously committed key is
        # exposed in the repository history and has to be revoked.
        if not groq_key:
            raise ValueError(
                "GROQ_API_KEY is not set. Export it or add it to a .env file."
            )
        self.llm=ChatGroq(
            api_key=groq_key,
            model="llama-3.3-70b-versatile",
            temperature=temperature
        )

    def build_prompt(self,query:str,context:str,chat_history:str):
        """Build the prompt from the chat history, document context and question.

        Args:
            query: The user's current question.
            context: Retrieved document text to ground the answer.
            chat_history: Previous conversation turns rendered as text.

        Returns:
            The full prompt string sent to the model.
        """
        prompt=f'''
        You are intelligent Assistant
        Use the document context and conversation history only to answer the user's question.

        Rules:
        1. Prefer the document context for document-related questions.
        2. Use chat history for conversation-related questions like:
        - "what was my last question?"
        - "what did you answer before?"
        3. If the answer is not available in either the context or the chat history, say:
        "I don't know based on the given context."
        Conversation History:
        {chat_history}

        context:
        {context}

        Current question:
        {query}

        If the answer is not in the context,say:
        "I Dont Know Based On The Given Context"


        '''
        return prompt

    def generate(self,query:str,context:str,chat_history:str=""):
        """Generate an answer with the LLM and return its text content."""
        prompt=self.build_prompt(query,context,chat_history)

        response=self.llm.invoke(prompt)

        return response.content
|
Src/llm/test.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from Src.llm.generator import Generator
|
| 2 |
+
|
| 3 |
+
# Smoke test: ask the generator one question against a fixed context.
gen=Generator()

query='why do transformer use'
context='Transformers use attention mechanism'

print(gen.generate(query,context))
|
Src/pipeline/__pycache__/rag_pipeline.cpython-313.pyc
ADDED
|
Binary file (3.03 kB). View file
|
|
|
Src/pipeline/rag_pipeline.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from Src.ingestion.data_loader import DataIngestion
|
| 2 |
+
from Src.embeddings.embedder import Embedder
|
| 3 |
+
from Src.vectorstore.faiss_store import FAISSSSTORE
|
| 4 |
+
from Src.retrieval.retriever import Retriever
|
| 5 |
+
from Src.llm.generator import Generator
|
| 6 |
+
|
| 7 |
+
class RAGPipeline:
    """End-to-end RAG flow for one PDF: ingest, index, retrieve and answer.

    Args:
        file_path: Path of the PDF to ingest when ``build_index`` is called.
    """

    def __init__(self,file_path:str):
        self.file_path=file_path
        self.embedder=Embedder()
        self.generator=Generator()
        self.faiss_store=FAISSSSTORE(self.embedder.embedding_model)
        # Conversation turns, oldest first: {'question': ..., 'answer': ...}.
        self.chat_memory=[]

    def build_index(self):
        """Chunk the PDF, build the vector store and save it to disk."""
        ingestion=DataIngestion(self.file_path)
        chunks=ingestion.ingests()

        self.faiss_store.create_vector_store(chunks)
        self.faiss_store.save_vector_store()

        return 'Vector Store Created And Saved Succesfully'

    def load_index(self):
        """Load the saved vector store from disk."""
        self.faiss_store.load_vector_store()

        return 'Vector Store Loaded Succesfully'

    def get_chat_history(self,limit:int=3):
        """Return the last ``limit`` conversation turns as text.

        Returns an empty string when there is no history or ``limit`` is
        not positive.
        """
        # Guard explicitly: a slice of [-0:] would return the whole list.
        if limit<=0:
            return ''

        # Slice (not index) so an empty memory yields no turns instead of
        # raising IndexError.
        recent_turns=self.chat_memory[-limit:]

        return ''.join(
            f"Turn {i}:\n"
            f"user: {item['question']}\n"
            f"Assistant: {item['answer']}\n\n"
            for i,item in enumerate(recent_turns,1)
        )

    def ask(self,query:str,k:int=3):
        """Full RAG flow: query -> retrieve context -> generate answer.

        The recent chat history is passed to the generator, and the new
        question/answer pair is recorded so later calls can refer to it.
        """
        chat_history=self.get_chat_history()

        retriever=Retriever(self.faiss_store.vector_store)
        context=retriever.retrieve(query,k=k)

        answer=self.generator.generate(query,context,chat_history=chat_history)

        self.chat_memory.append({'question':query,'answer':answer})
        return answer
|
Src/retrieval/__pycache__/retriever.cpython-312.pyc
ADDED
|
Binary file (1.57 kB). View file
|
|
|
Src/retrieval/__pycache__/retriever.cpython-313.pyc
ADDED
|
Binary file (1.65 kB). View file
|
|
|
Src/retrieval/retriever.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
class Retriever:
    """Fetch the chunks most similar to a query and render them as context text."""

    def __init__(self,vector_store):
        self.vector_store=vector_store

    def get_relevant_documents(self,query:str,k:int=5):
        """Return the top ``k`` documents from the store's similarity search."""
        return self.vector_store.similarity_search(query,k=k)

    def format_context(self,documents):
        """Render documents as one string of numbered ``[Chunk n]`` sections."""
        sections=[
            f"[Chunk {number}]\n{doc.page_content}\n\n"
            for number, doc in enumerate(documents, start=1)
        ]
        return "".join(sections)

    def retrieve(self,query:str,k:int=3):
        """Run the full retrieval pipeline: search, then format as context."""
        matches=self.get_relevant_documents(query,k)
        return self.format_context(matches)
|
Src/retrieval/test.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from Src.embeddings.embedder import Embedder
# The class defined in faiss_store.py is named FAISSSSTORE; importing
# "FAISSStore" raised ImportError before anything else could run.
from Src.vectorstore.faiss_store import FAISSSSTORE
from Src.retrieval.retriever import Retriever

# Load the vector store previously saved to disk.
embedder=Embedder()
faiss_store=FAISSSSTORE(embedder.embedding_model)
faiss_store.load_vector_store()

retriever=Retriever(faiss_store.vector_store)

query='What Is The Main Idea Of Document'
context=retriever.retrieve(query)

# Show the retrieved context; without this the script produced no output.
print(context)
|
Src/vectorstore/__pycache__/faiss_store.cpython-313.pyc
ADDED
|
Binary file (2.47 kB). View file
|
|
|
Src/vectorstore/faiss_store.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from langchain_community.vectorstores import FAISS
|
| 3 |
+
|
| 4 |
+
class FAISSSSTORE:
    """Create, persist, reload and query a FAISS vector store."""

    def __init__(self,embedding_model):
        self.embedding_model=embedding_model
        self.vector_store=None

    def create_vector_store(self,chunks):
        """Build the FAISS vector store from document chunks and return it."""
        store=FAISS.from_documents(
            documents=chunks,
            embedding=self.embedding_model
        )
        self.vector_store=store
        return store

    def save_vector_store(self,folder_path:str='artifacts/faiss_index'):
        """Save the FAISS index locally under ``folder_path``.

        Raises:
            ValueError: If no vector store has been created yet.
        """
        store=self.vector_store
        if store is None:
            raise ValueError('Vector Has Not Been Created yet')

        os.makedirs(folder_path,exist_ok=True)
        store.save_local(folder_path)

    def load_vector_store(self,folder_path:str='artifacts/faiss_index'):
        """Load the FAISS index from ``folder_path`` and return it."""
        # NOTE(review): allow_dangerous_deserialization opts in to loading
        # serialized data from disk - presumably only safe for indexes this
        # application wrote itself; confirm folder_path is never user-controlled.
        store=FAISS.load_local(
            folder_path=folder_path,
            embeddings=self.embedding_model,
            allow_dangerous_deserialization=True
        )
        self.vector_store=store
        return store

    def similarity_search(self,query:str,k:int=3):
        """Return the ``k`` chunks most similar to ``query``.

        Raises:
            ValueError: If the vector store is neither loaded nor created.
        """
        store=self.vector_store
        if store is None:
            raise ValueError('Vector Store is Not loaded or Created yet')

        return store.similarity_search(query,k=k)
|
Src/vectorstore/test.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys

from Src.ingestion.data_loader import DataIngestion
from Src.embeddings.embedder import Embedder
# The class defined in faiss_store.py is named FAISSSSTORE; importing
# "FAISSStore" raised ImportError before anything else could run.
from Src.vectorstore.faiss_store import FAISSSSTORE

# Step 1: Load and chunk documents
# The PDF path comes from the command line; the previous empty-string path
# could never be loaded.
if len(sys.argv) < 2:
    raise SystemExit("usage: python -m Src.vectorstore.test <path-to-pdf>")
ingestion = DataIngestion(sys.argv[1])
# DataIngestion exposes ingests(), not ingest().
chunks = ingestion.ingests()

# Step 2: Load embedding model
embedder = Embedder()

# Step 3: Create vector store
faiss_store = FAISSSSTORE(embedder.embedding_model)
faiss_store.create_vector_store(chunks)

# Step 4: Search
results = faiss_store.similarity_search("What is the main topic of the document?", k=2)

for i, doc in enumerate(results, 1):
    print(f"\nResult {i}:")
    print(doc.page_content[:500])
    print("-" * 50)
|
app.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import uuid
|
| 3 |
+
import tempfile
|
| 4 |
+
|
| 5 |
+
from flask import Flask, request, jsonify, send_from_directory
|
| 6 |
+
from flask_cors import CORS
|
| 7 |
+
|
| 8 |
+
from Src.pipeline.rag_pipeline import RAGPipeline
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Load variables from a local .env file into the process environment.
load_dotenv()

# The Flask app also serves the built frontend from the site root.
app = Flask(__name__, static_url_path="", static_folder='frontend/dist/client')

# Allow cross-origin requests from any origin, but only on the /api/ routes.
CORS(app, resources={r"/api/*": {"origins": "*"}})

# In-memory session registry keyed by session id; entries are lost on restart.
sessions: dict = {}
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@app.route("/api/upload", methods=["POST"])
def upload_pdf():
    """Accept a PDF upload, index it and open a chat session for it.

    Expects a multipart form with the PDF under the field name ``file``.

    Returns:
        200 with ``message`` and ``session_id`` on success, 400 when the file
        is missing or not a PDF, 500 when processing the PDF fails.
    """
    # 1. Validate file is in the request
    if "file" not in request.files:
        return jsonify({"error": "No file provided. Field name must be 'file'."}), 400

    file = request.files["file"]
    # filename can be missing; treat that like any other non-PDF upload
    # instead of crashing on None.lower().
    filename = file.filename or ""

    if not filename.lower().endswith(".pdf"):
        return jsonify({"error": "Only PDF files are supported."}), 400

    # Reserve a temp path and close our handle immediately, so file.save()
    # can reopen the path and no handle leaks if saving fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp_path = tmp.name

    try:
        file.save(tmp_path)

        pipeline = RAGPipeline(tmp_path)
        result = pipeline.build_index()
        print(f"[Upload] {result} | file: {filename}")

    except Exception as e:
        print(f"[Upload ERROR] {e}")
        return jsonify({"error": f"Failed to process PDF: {str(e)}"}), 500

    finally:
        # The uploaded copy is only needed while the index is being built.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)

    session_id = str(uuid.uuid4())
    sessions[session_id] = {
        "pipeline": pipeline,
        "filename": filename
    }

    print(f"[Upload] Session created -> {session_id}")

    return jsonify({
        "message": f"'{filename}' processed successfully.",
        "session_id": session_id
    }), 200
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@app.route("/api/chat", methods=["POST"])
def chat():
    """Answer a question against the PDF of an existing session.

    Expects a JSON object with string fields ``question`` and ``session_id``.

    Returns:
        200 with ``answer`` on success, 400 for a missing or invalid body or
        field, 404 for an unknown session, 500 when answering fails.
    """
    # silent=True returns None for a missing or malformed JSON body, so the
    # JSON error below is used instead of the framework's own error response.
    data = request.get_json(silent=True)

    # A JSON array or scalar has no .get(); only objects are accepted.
    if not data or not isinstance(data, dict):
        return jsonify({"error": "Request body must be JSON."}), 400

    raw_question = data.get("question")
    raw_session_id = data.get("session_id")

    # Non-string values (null, numbers, ...) are treated as missing rather
    # than crashing on .strip().
    question = raw_question.strip() if isinstance(raw_question, str) else ""
    session_id = raw_session_id.strip() if isinstance(raw_session_id, str) else ""

    if not question:
        return jsonify({"error": "Question is required."}), 400

    if not session_id:
        return jsonify({"error": "Session ID is required."}), 400

    session = sessions.get(session_id)

    if not session:
        return jsonify({
            "error": "Session not found. Please upload PDF again."
        }), 404

    try:
        pipeline = session["pipeline"]

        answer = pipeline.ask(question)

        print(f"[Chat] Q: {question}")
        print(f"[Chat] A: {answer}")

        return jsonify({
            "answer": answer
        }), 200

    except Exception as e:
        import traceback

        print("\n========== CHAT ERROR ==========")
        traceback.print_exc()
        print("================================\n")

        return jsonify({
            "error": str(e)
        }), 500
|
| 118 |
+
|
| 119 |
+
@app.route("/", defaults={"path": ""})
@app.route("/<path:path>")
def serve_react(path):
    """Serve a built frontend file, falling back to ``index.html``.

    Any path that is not an existing file under the static folder gets
    ``index.html``, so client-side routes resolve in the browser.
    """
    full_path = os.path.join(app.static_folder, path)
    # isfile rather than exists: a directory would pass an exists() check
    # and then fail to be served, instead of falling back to index.html.
    if path and os.path.isfile(full_path):
        return send_from_directory(app.static_folder, path)
    return send_from_directory(app.static_folder, "index.html")
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
if __name__ == "__main__":
    # Resolve the port first so the banner shows the address the server
    # actually binds to (the banner used to advertise port 5000).
    port = int(os.environ.get("PORT", 7860))
    base_url = f"http://localhost:{port}"

    print("\n DocuMind AI - Server Starting")
    print("=" * 45)
    print(f" Login -> {base_url}/login.html")
    print(f" Register -> {base_url}/register.html")
    print(f" App -> {base_url}")
    print(" Upload -> POST /api/upload")
    print(" Chat -> POST /api/chat")
    print("=" * 45)

    # Make sure the folder the vector index is saved to exists.
    os.makedirs("artifacts/faiss_index", exist_ok=True)

    app.run(host="0.0.0.0", port=port)
|
dockerfile
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY requirements.txt .
|
| 6 |
+
|
| 7 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 8 |
+
|
| 9 |
+
COPY . .
|
| 10 |
+
|
| 11 |
+
EXPOSE 7860
|
| 12 |
+
|
| 13 |
+
CMD ["python", "app.py"]
|