Spaces:
Running
Running
Upload 5 files
Browse files- app/core/config.py +11 -0
- app/core/embedding_engine.py +66 -0
- app/core/llm_engine.py +56 -0
- app/core/mongo.py +10 -0
- app/core/pdf_processor.py +52 -0
app/core/config.py
CHANGED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
|
| 4 |
+
# Load variables from a local .env file (when one exists) into the process
# environment before any of the settings below are read.
load_dotenv()

# API key handed to the Gemini client in llm_engine.py.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Endpoint and API key handed to the Qdrant client in embedding_engine.py.
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")

# MongoDB connection string, plus a database name that falls back to
# "smartnotes" when MONGO_DB is not set.
MONGO_URI = os.environ.get("MONGO_URI")
MONGO_DB = os.environ.get("MONGO_DB", "smartnotes")
|
app/core/embedding_engine.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# embedding_engine.py
|
| 2 |
+
|
| 3 |
+
import uuid
|
| 4 |
+
from qdrant_client import QdrantClient, models
|
| 5 |
+
from qdrant_client.http.models import Distance, VectorParams
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
from app.core.config import QDRANT_URL, QDRANT_API_KEY
|
| 8 |
+
|
| 9 |
+
# Qdrant collection that receives every stored chunk, and the number of
# points sent per upsert request.
COLLECTION_NAME = "smartnotes"
BATCH_SIZE = 100

# Sentence-embedding model, instantiated once at import time and shared by
# every call in this module.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Shared Qdrant client, configured from the values in app.core.config.
qdrant = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    check_compatibility=False,
)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def ensure_collection():
    """Make sure the Qdrant collection and its ``doc_id`` payload index exist.

    Creates ``COLLECTION_NAME`` with 384-dimensional cosine-distance vectors
    when the server does not list it yet, then requests a keyword index on
    the ``doc_id`` payload field.
    """
    existing_names = {c.name for c in qdrant.get_collections().collections}

    if COLLECTION_NAME not in existing_names:
        qdrant.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=VectorParams(
                # Must equal the embedding model's output width
                # (all-MiniLM-L6-v2 produces 384-dimensional vectors).
                size=384,
                distance=Distance.COSINE,
            ),
        )

    # Requested on every call, not only right after creation, so a collection
    # that was created before this index was introduced still receives it.
    # NOTE(review): relies on Qdrant treating a repeated request for an
    # identical index as a no-op — confirm against the deployed version.
    qdrant.create_payload_index(
        collection_name=COLLECTION_NAME,
        field_name="doc_id",
        field_schema="keyword",
    )
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def embed_and_store(text_chunks, doc_id):
    """Embed text chunks and upsert them into the Qdrant collection in batches.

    Args:
        text_chunks: Sequence of strings to embed; one point is stored per
            chunk, with the chunk text kept in the payload under ``"text"``.
        doc_id: Identifier stored in every point's payload under ``"doc_id"``.

    Returns:
        None. Progress is reported with ``print``.
    """
    if not text_chunks:
        # Nothing to embed: skip the collection check and the model call.
        print("⚠️ No text chunks to embed; nothing stored.")
        return

    ensure_collection()
    print(f"🔹 Embedding {len(text_chunks)} chunks...")

    # Generate embeddings (one vector per chunk, converted to plain lists).
    vectors = embedder.encode(text_chunks, show_progress_bar=True).tolist()

    # Prepare points: each chunk gets a fresh random UUID as its point id.
    points = [
        models.PointStruct(
            id=str(uuid.uuid4()),
            vector=vector,
            payload={"doc_id": doc_id, "text": chunk},
        )
        for chunk, vector in zip(text_chunks, vectors)
    ]

    # Ceiling division, so an exact multiple of BATCH_SIZE is not reported as
    # having one extra batch.
    total_batches = (len(points) + BATCH_SIZE - 1) // BATCH_SIZE

    # Upsert in small batches to avoid timeouts.
    print("🔹 Uploading to Qdrant in batches...")
    batch_starts = range(0, len(points), BATCH_SIZE)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch = points[start:start + BATCH_SIZE]
        qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
        print(f" ✔ Uploaded batch {batch_number}/{total_batches}")

    print("✅ All embeddings stored successfully!")
|
app/core/llm_engine.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# llm_engine.py
|
| 2 |
+
|
| 3 |
+
import google.generativeai as genai
|
| 4 |
+
from app.core.config import GEMINI_API_KEY
|
| 5 |
+
from langchain_core.prompts import PromptTemplate
|
| 6 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 7 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 8 |
+
|
| 9 |
+
# ✅ Configure Gemini client
|
| 10 |
+
genai.configure(api_key=GEMINI_API_KEY)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def ask_gemini(context: str, question: str) -> str:
    """Answer a question from document context with Gemini through LangChain.

    Args:
        context: Document text the answer must be based on.
        question: The user's question.

    Returns:
        The model's answer with surrounding whitespace removed; a fallback
        notice when the model returns nothing but whitespace; or an error
        notice when any step raises. Failures are reported in the returned
        string rather than raised.
    """
    try:
        # Initialize Gemini LLM via LangChain.
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash",
            google_api_key=GEMINI_API_KEY,
            temperature=0.4,
            max_output_tokens=2048,
            convert_system_message_to_human=True,
        )

        # Structured prompt that asks for Markdown-formatted answers.
        prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=(
                "You are an intelligent document assistant.\n"
                "Answer the user's question strictly using the provided context.\n"
                "Respond in **clean Markdown formatting** with:\n"
                "- Headings (##)\n"
                "- Bullet points and numbered lists\n"
                "- **Bold keywords**\n"
                "- Tables (if useful)\n"
                "- Code blocks when necessary\n"
                "- Proper spacing and paragraphs for readability\n\n"
                "### 📄 Document Context:\n{context}\n\n"
                "### 💬 User Question:\n{question}\n\n"
                "### 🧠 Answer:"
            ),
        )

        # Combine the prompt, model, and parser (LCEL chain), then run it.
        chain = prompt | llm | StrOutputParser()
        response = chain.invoke({"context": context, "question": question})

        # Strip before testing, so a whitespace-only reply also gets the
        # fallback notice instead of being returned as an empty string.
        answer = response.strip() if response else ""
        return answer or "⚠️ No response from Gemini."

    except Exception as e:
        # Deliberate catch-all boundary: callers always receive a string.
        return f"⚠️ Gemini (LangChain) error: {e}"
|
app/core/mongo.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pymongo import MongoClient
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# Take the URI from the shared settings module so that load_dotenv() has
# already run when the value is read, whatever order modules are imported in.
from app.core.config import MONGO_URI

# NOTE(review): app/core/config.py also defines MONGO_DB (default
# "smartnotes"), which differs from this hard-coded name — confirm which
# database is intended before switching.
DB_NAME = "pdf_chat_db"

# Module-level client and database handle shared by importers.
client = MongoClient(MONGO_URI)
db = client[DB_NAME]

# Collection handle for the "conversations" collection.
conversations = db["conversations"]
|
app/core/pdf_processor.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pdf_processor.py
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from pypdf import PdfReader
|
| 5 |
+
from pdf2image import convert_from_path
|
| 6 |
+
import pytesseract
|
| 7 |
+
|
| 8 |
+
# Optional: Set Tesseract path manually on Windows
|
| 9 |
+
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
|
| 10 |
+
|
| 11 |
+
def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from a PDF, using OCR for pages without embedded text.

    Each page is first read with pypdf. A page whose embedded text is missing
    or whitespace-only is rendered to an image and passed through Tesseract
    (English, ``--psm 6``). An error on one page is printed and that page is
    skipped; the remaining pages are still processed.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of every page that yielded any, joined with newlines; an
        empty string when nothing could be extracted.
    """
    text_output = []
    reader = PdfReader(file_path)
    total_pages = len(reader.pages)

    print(f"📄 Processing PDF: {file_path} ({total_pages} pages)")

    for page_num, page in enumerate(reader.pages, start=1):
        try:
            # Try normal text extraction first.
            extracted_text = page.extract_text()
            if extracted_text and extracted_text.strip():
                text_output.append(extracted_text)
                print(f"✅ Page {page_num}: Extracted embedded text.")
                continue

            # No embedded text: render only this page and run OCR on it.
            print(f"🔍 Page {page_num}: No text found, running OCR...")
            images = convert_from_path(
                file_path, first_page=page_num, last_page=page_num
            )
            ocr_text = "".join(
                pytesseract.image_to_string(img, lang="eng", config="--psm 6")
                for img in images
            )
            if ocr_text.strip():
                text_output.append(ocr_text)
                print(f"🧠 Page {page_num}: OCR extraction complete.")
            else:
                print(f"⚠️ Page {page_num}: OCR found no readable text.")
        except Exception as e:
            # Best effort per page: report the failure and move on.
            print(f"❌ Error processing page {page_num}: {e}")

    full_text = "\n".join(text_output)
    if not full_text.strip():
        print("⚠️ Warning: No text extracted from this PDF at all.")
    else:
        print(f"✅ Done! Extracted {len(full_text.split())} words total.")

    return full_text
|