Spaces:

pluto90
/

Smart-Notes-backend

Running

App Files Files Community

pluto90 committed on Apr 7

Commit

20a8e92

verified ·

1 Parent(s): a38b306

Upload 5 files

Browse files

Files changed (5) hide show

app/core/config.py +11 -0
app/core/embedding_engine.py +66 -0
app/core/llm_engine.py +56 -0
app/core/mongo.py +10 -0
app/core/pdf_processor.py +52 -0

app/core/config.py CHANGED Viewed

	@@ -0,0 +1,11 @@

+import os
+from dotenv import load_dotenv
+load_dotenv()
+GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+QDRANT_URL = os.getenv("QDRANT_URL")
+QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
+MONGO_URI = os.getenv("MONGO_URI")
+MONGO_DB = os.getenv("MONGO_DB", "smartnotes")

app/core/embedding_engine.py ADDED Viewed

	@@ -0,0 +1,66 @@

+# embedding_engine.py
+import uuid
+from qdrant_client import QdrantClient, models
+from qdrant_client.http.models import Distance, VectorParams
+from sentence_transformers import SentenceTransformer
+from app.core.config import QDRANT_URL, QDRANT_API_KEY
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+qdrant = QdrantClient(
+    url=QDRANT_URL,
+    api_key=QDRANT_API_KEY,
+    check_compatibility=False
+    )
+COLLECTION_NAME = "smartnotes"
+BATCH_SIZE = 100
+def ensure_collection():
+    collections = qdrant.get_collections().collections
+    if COLLECTION_NAME not in [c.name for c in collections]:
+        qdrant.create_collection(
+            collection_name=COLLECTION_NAME,
+            vectors_config=VectorParams(
+                size=384,
+                distance=Distance.COSINE
+            ),
+        )
+            # ✅ Add this part
+    qdrant.create_payload_index(
+        collection_name=COLLECTION_NAME,
+        field_name="doc_id",
+        field_schema="keyword"
+    )
+def embed_and_store(text_chunks, doc_id):
+    """Embed chunks and store them in Qdrant efficiently."""
+    ensure_collection()
+    print(f"🔹 Embedding {len(text_chunks)} chunks...")
+    # Generate embeddings
+    vectors = embedder.encode(text_chunks, show_progress_bar=True).tolist()
+    # Prepare points
+    points = [
+        models.PointStruct(
+            id=str(uuid.uuid4()),
+            vector=vectors[i],
+            payload={"doc_id": doc_id, "text": text_chunks[i]},
+        )
+        for i in range(len(vectors))
+    ]
+    # ✅ Upsert in small batches to avoid timeouts
+    print("🔹 Uploading to Qdrant in batches...")
+    for i in range(0, len(points), BATCH_SIZE):
+        batch = points[i:i + BATCH_SIZE]
+        qdrant.upsert(collection_name=COLLECTION_NAME, points=batch)
+        print(f"   → Uploaded batch {i // BATCH_SIZE + 1}/{len(points) // BATCH_SIZE + 1}")
+    print("✅ All embeddings stored successfully!")

app/core/llm_engine.py ADDED Viewed

	@@ -0,0 +1,56 @@

+# llm_engine.py
+import google.generativeai as genai
+from app.core.config import GEMINI_API_KEY
+from langchain_core.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_google_genai import ChatGoogleGenerativeAI
+# Configure the google.generativeai module with the API key from app.core.config.
+# NOTE(review): ask_gemini also passes the key to ChatGoogleGenerativeAI directly,
+# so this call may be redundant — confirm before removing.
+genai.configure(api_key=GEMINI_API_KEY)
+def ask_gemini(context: str, question: str) -> str:
+    """
+    Ask Gemini a question based on document context using LangChain for better formatting and control.
+    """
+    try:
+        # ✅ Initialize Gemini LLM via LangChain
+        llm = ChatGoogleGenerativeAI(
+            model="gemini-2.5-flash",
+            google_api_key=GEMINI_API_KEY,
+            temperature=0.4,
+            max_output_tokens=2048,
+            convert_system_message_to_human=True
+        )
+        # ✅ Define a structured, formatting-rich prompt
+        prompt = PromptTemplate(
+            input_variables=["context", "question"],
+            template=(
+                "You are an intelligent document assistant.\n"
+                "Answer the user's question strictly using the provided context.\n"
+                "Respond in **clean Markdown formatting** with:\n"
+                "- Headings (##)\n"
+                "- Bullet points and numbered lists\n"
+                "- **Bold keywords**\n"
+                "- Tables (if useful)\n"
+                "- Code blocks when necessary\n"
+                "- Proper spacing and paragraphs for readability\n\n"
+                "### 📄 Document Context:\n{context}\n\n"
+                "### 💬 User Question:\n{question}\n\n"
+                "### 🧠 Answer:"
+            )
+        )
+        # ✅ Combine the prompt, model, and parser (modern LCEL chain)
+        chain = prompt | llm | StrOutputParser()
+        # ✅ Run the chain
+        response = chain.invoke({"context": context, "question": question})
+        return response.strip() if response else "⚠️ No response from Gemini."
+    except Exception as e:
+        return f"⚠️ Gemini (LangChain) error: {str(e)}"

app/core/mongo.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from pymongo import MongoClient
+import os
+MONGO_URI = os.getenv("MONGO_URI")
+DB_NAME = "pdf_chat_db"
+client = MongoClient(MONGO_URI)
+db = client[DB_NAME]
+conversations = db["conversations"]

app/core/pdf_processor.py ADDED Viewed

	@@ -0,0 +1,52 @@

+# pdf_processor.py
+import os
+from pypdf import PdfReader
+from pdf2image import convert_from_path
+import pytesseract
+# Optional: Set Tesseract path manually on Windows
+# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+def extract_text_from_pdf(file_path: str) -> str:
+    """
+    Extract text from both text-based and image-based PDFs.
+    Falls back to OCR using pytesseract if no embedded text is found.
+    """
+    text_output = []
+    reader = PdfReader(file_path)
+    total_pages = len(reader.pages)
+    print(f"📄 Processing PDF: {file_path} ({total_pages} pages)")
+    for page_num, page in enumerate(reader.pages, start=1):
+        try:
+            # Try normal text extraction
+            extracted_text = page.extract_text()
+            if extracted_text and extracted_text.strip():
+                text_output.append(extracted_text)
+                print(f"✅ Page {page_num}: Extracted embedded text.")
+            else:
+                # Run OCR if no text found
+                print(f"🔍 Page {page_num}: No text found, running OCR...")
+                images = convert_from_path(
+                    file_path, first_page=page_num, last_page=page_num
+                )
+                ocr_text = ""
+                for img in images:
+                    ocr_text += pytesseract.image_to_string(img, lang="eng", config="--psm 6")
+                if ocr_text.strip():
+                    text_output.append(ocr_text)
+                    print(f"🧠 Page {page_num}: OCR extraction complete.")
+                else:
+                    print(f"⚠️ Page {page_num}: OCR found no readable text.")
+        except Exception as e:
+            print(f"❌ Error processing page {page_num}: {e}")
+    full_text = "\n".join(text_output)
+    if not full_text.strip():
+        print("⚠️ Warning: No text extracted from this PDF at all.")
+    else:
+        print(f"✅ Done! Extracted {len(full_text.split())} words total.")
+    return full_text