pluto90 committed on
Commit
20a8e92
·
verified ·
1 Parent(s): a38b306

Upload 5 files

Browse files
app/core/config.py CHANGED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
# Load variables from a .env file (when one is present) before any of the
# settings below are read from the process environment.
load_dotenv()

# Gemini credentials; None when the variable is unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Qdrant endpoint and credentials; None when unset.
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")

# MongoDB connection string and database name (the name falls back to
# "smartnotes" when MONGO_DB is not set).
MONGO_URI = os.environ.get("MONGO_URI")
MONGO_DB = os.environ.get("MONGO_DB", "smartnotes")
app/core/embedding_engine.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # embedding_engine.py
2
+
3
+ import uuid
4
+ from qdrant_client import QdrantClient, models
5
+ from qdrant_client.http.models import Distance, VectorParams
6
+ from sentence_transformers import SentenceTransformer
7
+ from app.core.config import QDRANT_URL, QDRANT_API_KEY
8
+
9
# Name of the Qdrant collection used by this module and the number of points
# sent per upsert request.
COLLECTION_NAME = "smartnotes"
BATCH_SIZE = 100

# Sentence-embedding model, instantiated once when the module is imported.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Shared Qdrant client configured from app.core.config; the client/server
# version compatibility check is switched off.
qdrant = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    check_compatibility=False,
)
19
+
20
+
21
def ensure_collection():
    """Make sure the Qdrant collection and its ``doc_id`` index exist.

    Creates ``COLLECTION_NAME`` with 384-dimensional cosine-distance vectors
    when the collection is missing, then requests a keyword payload index on
    the ``doc_id`` field.
    """
    # Ask the server directly instead of listing every collection and
    # scanning the names client-side.
    if not qdrant.collection_exists(COLLECTION_NAME):
        qdrant.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=VectorParams(
                # Must equal the vector length produced by `embedder`.
                size=384,
                distance=Distance.COSINE,
            ),
        )

    # Requested on every call so that a collection created before the index
    # was introduced also receives it.
    # NOTE(review): assumes Qdrant treats a repeated index request on an
    # already-indexed field as a no-op — confirm against the server version.
    qdrant.create_payload_index(
        collection_name=COLLECTION_NAME,
        field_name="doc_id",
        field_schema="keyword",
    )
38
+
39
+
40
+
41
def embed_and_store(text_chunks, doc_id):
    """Embed text chunks and store them in Qdrant in batches.

    Args:
        text_chunks: Sized sequence of strings to embed; one point is stored
            per chunk.
        doc_id: Identifier written to every point's ``doc_id`` payload field.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Nothing to embed: skip the model call and every network round-trip.
    if len(text_chunks) == 0:
        print("⚠️ No text chunks to embed; nothing was stored.")
        return

    ensure_collection()
    print(f"🔹 Embedding {len(text_chunks)} chunks...")

    # Generate embeddings (converted to plain lists for the Qdrant client).
    vectors = embedder.encode(text_chunks, show_progress_bar=True).tolist()

    # One point per chunk, each with a fresh random UUID as its id.
    points = [
        models.PointStruct(
            id=str(uuid.uuid4()),
            vector=vector,
            payload={"doc_id": doc_id, "text": chunk},
        )
        for vector, chunk in zip(vectors, text_chunks)
    ]

    # Ceiling division: an exact multiple of BATCH_SIZE must not report an
    # extra, non-existent batch in the progress output.
    total_batches = (len(points) + BATCH_SIZE - 1) // BATCH_SIZE

    # Upsert in small batches to avoid timeouts.
    print("🔹 Uploading to Qdrant in batches...")
    batch_starts = range(0, len(points), BATCH_SIZE)
    for batch_no, start in enumerate(batch_starts, start=1):
        batch = points[start:start + BATCH_SIZE]
        qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
        print(f"   → Uploaded batch {batch_no}/{total_batches}")

    print("✅ All embeddings stored successfully!")
app/core/llm_engine.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # llm_engine.py
2
+
3
+ import google.generativeai as genai
4
+ from app.core.config import GEMINI_API_KEY
5
+ from langchain_core.prompts import PromptTemplate
6
+ from langchain_core.output_parsers import StrOutputParser
7
+ from langchain_google_genai import ChatGoogleGenerativeAI
8
+
9
+ # βœ… Configure Gemini client
10
genai.configure(api_key=GEMINI_API_KEY)

# The template text never changes between calls, so it is built once at
# import time instead of on every request.
_ANSWER_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "You are an intelligent document assistant.\n"
        "Answer the user's question strictly using the provided context.\n"
        "Respond in **clean Markdown formatting** with:\n"
        "- Headings (##)\n"
        "- Bullet points and numbered lists\n"
        "- **Bold keywords**\n"
        "- Tables (if useful)\n"
        "- Code blocks when necessary\n"
        "- Proper spacing and paragraphs for readability\n\n"
        "### 📄 Document Context:\n{context}\n\n"
        "### 💬 User Question:\n{question}\n\n"
        "### 🧠 Answer:"
    ),
)


def ask_gemini(context: str, question: str) -> str:
    """Ask Gemini a question about the given document context.

    The prompt, the Gemini chat model and a string output parser are chained
    with LangChain and invoked once.

    Args:
        context: Document text the answer should be based on.
        question: The user's question.

    Returns:
        The model's answer with surrounding whitespace stripped, a fixed
        warning string when the response is empty, or a warning string
        containing the error text when anything in the call raises.
    """
    try:
        # The model is created inside the try block so that configuration
        # errors are reported through the same string-returning path.
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash",
            google_api_key=GEMINI_API_KEY,
            temperature=0.4,
            max_output_tokens=2048,
            convert_system_message_to_human=True,
        )

        # Prompt -> model -> plain string.
        chain = _ANSWER_PROMPT | llm | StrOutputParser()
        response = chain.invoke({"context": context, "question": question})

        return response.strip() if response else "⚠️ No response from Gemini."

    except Exception as e:
        # Deliberate catch-all: this function always hands back a string
        # rather than propagating the failure to its caller.
        return f"⚠️ Gemini (LangChain) error: {e}"
app/core/mongo.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymongo import MongoClient
2
+ import os
3
+
4
# Take the URI from app.core.config, which calls load_dotenv() before reading
# the environment. Reading os.getenv here directly made the value depend on
# whether the config module happened to be imported first.
from app.core.config import MONGO_URI

# NOTE(review): app.core.config also defines MONGO_DB (default "smartnotes"),
# which differs from this hard-coded name — confirm which database is intended.
DB_NAME = "pdf_chat_db"

# Shared client and database handle, created when the module is imported.
client = MongoClient(MONGO_URI)
db = client[DB_NAME]

# Collection handle for the "conversations" collection.
conversations = db["conversations"]
app/core/pdf_processor.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pdf_processor.py
2
+
3
+ import os
4
+ from pypdf import PdfReader
5
+ from pdf2image import convert_from_path
6
+ import pytesseract
7
+
8
+ # Optional: Set Tesseract path manually on Windows
9
+ # pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
10
+
11
def _ocr_page(file_path: str, page_num: int) -> str:
    """Render one page (1-based) of the PDF to images and return its OCR text."""
    images = convert_from_path(
        file_path, first_page=page_num, last_page=page_num
    )
    # join() instead of repeated += so the text is assembled in one pass.
    return "".join(
        pytesseract.image_to_string(img, lang="eng", config="--psm 6")
        for img in images
    )


def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from both text-based and image-based PDFs.

    Each page is first read for embedded text; when a page yields none, it is
    rendered to an image and passed through Tesseract OCR instead.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages that produced any, joined with newlines. Empty
        when nothing could be extracted. A page that raises during
        processing is reported with ``print`` and skipped.
    """
    text_output = []
    reader = PdfReader(file_path)
    total_pages = len(reader.pages)

    print(f"📄 Processing PDF: {file_path} ({total_pages} pages)")

    for page_num, page in enumerate(reader.pages, start=1):
        try:
            # Try normal text extraction first.
            extracted_text = page.extract_text()
            if extracted_text and extracted_text.strip():
                text_output.append(extracted_text)
                print(f"✅ Page {page_num}: Extracted embedded text.")
                continue

            # No embedded text on this page: fall back to OCR.
            print(f"🔍 Page {page_num}: No text found, running OCR...")
            ocr_text = _ocr_page(file_path, page_num)
            if ocr_text.strip():
                text_output.append(ocr_text)
                print(f"🧠 Page {page_num}: OCR extraction complete.")
            else:
                print(f"⚠️ Page {page_num}: OCR found no readable text.")
        except Exception as e:
            # Best effort: one bad page must not abort the whole document.
            print(f"❌ Error processing page {page_num}: {e}")

    full_text = "\n".join(text_output)
    if not full_text.strip():
        print("⚠️ Warning: No text extracted from this PDF at all.")
    else:
        print(f"✅ Done! Extracted {len(full_text.split())} words total.")

    return full_text