Dpshkh committed on
Commit
beba6d9
·
verified ·
1 Parent(s): 655c9b5

Upload 8 files

Browse files
Files changed (8) hide show
  1. .env +5 -0
  2. __init__.py +0 -0
  3. chunker.py +31 -0
  4. groq_llm.py +42 -0
  5. main.py +60 -0
  6. parser.py +15 -0
  7. requirements.txt +36 -0
  8. retriever.py +79 -0
.env ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # SECURITY(review): real API keys are committed to version control here — rotate these credentials and supply them via deployment environment variables instead of checking them in
+ GROQ_API_KEY=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
2
+ PINECONE_API_KEY=pcsk_5BuB2j_JspVPM6YSmS1FC7uUAM7mc6jkd3X9HxvWihUuJv1nkit4hwpF1rR55pSzy2Eu5g
3
+ PINECONE_INDEX_NAME=doc-index
4
+ PORT=10000
5
+ PINECONE_REGION=us-east-1
__init__.py ADDED
File without changes
chunker.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def chunk_text(text, max_tokens=300, max_chunks=10):
    """Split *text* into sentence-aligned chunks of at most *max_tokens* words.

    Sentences are kept together where possible. Bug fix: a single sentence
    longer than *max_tokens* words is now sliced on word boundaries so no
    chunk can ever exceed the limit (previously such a sentence was emitted
    as one oversized chunk). At most *max_chunks* chunks are produced; text
    beyond that cap is dropped, as before.

    Args:
        text: Raw input text.
        max_tokens: Maximum number of whitespace-separated words per chunk.
        max_chunks: Hard cap on the number of chunks returned.

    Returns:
        List of non-empty chunk strings (empty list for blank input).
    """
    import re
    # Split on whitespace that follows sentence-ending punctuation.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_len = 0

    def _flush():
        # Emit the accumulated words as one chunk, skipping empties.
        nonlocal current_chunk, current_len
        joined = " ".join(current_chunk).strip()
        if joined:
            chunks.append(joined)
        current_chunk = []
        current_len = 0

    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue
        # Slice oversized sentences into max_tokens-word pieces so the
        # per-chunk word limit always holds.
        for start in range(0, len(words), max_tokens):
            piece = words[start:start + max_tokens]
            if current_len + len(piece) > max_tokens:
                _flush()
                if len(chunks) >= max_chunks:
                    return chunks
            current_chunk.extend(piece)
            current_len += len(piece)

    # Emit the trailing partial chunk, still respecting the chunk cap.
    if current_chunk and len(chunks) < max_chunks:
        _flush()
    return chunks
groq_llm.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
def truncate_context(context, max_words=800):
    """Cap *context* at its first *max_words* whitespace-separated words."""
    limited = context.split()[:max_words]
    return " ".join(limited)
10
+
11
def query_groq_llm(context, question):
    """Answer *question* from *context* via Groq's OpenAI-compatible chat API.

    The context is truncated before sending to keep the prompt within the
    model's limits. Bug fix: the HTTP call now carries an explicit timeout —
    previously a stalled connection would block the caller forever.

    Args:
        context: Retrieved document text to ground the answer in.
        question: The user's question.

    Returns:
        The model's answer string, or a ``"GROQ LLM error: ..."`` string on
        any failure (missing key, HTTP error, timeout, malformed response).
    """
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        return "GROQ LLM error: GROQ_API_KEY is not set in environment variables"

    context = truncate_context(context)

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    data = {
        "model": "llama3-8b-8192",  # Smaller model
        "messages": [
            {"role": "system", "content": "You are an intelligent assistant."},
            {"role": "user", "content": f"Use the following context to answer the question.\n\nContext:\n{context}\n\nQuestion:\n{question}"}
        ],
        "temperature": 0.3,  # Reduce hallucination & memory use
        "max_tokens": 150  # Lowered to limit output size
    }

    try:
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=30  # Bug fix: without a timeout the request can hang indefinitely
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"GROQ LLM error: {str(e)}"
main.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, Form, File
2
+ from fastapi.responses import JSONResponse
3
+ from app.parser import extract_text_from_pdf
4
+ from app.chunker import chunk_text
5
+ from app.retriever import store_chunks_in_pinecone, query_chunks_from_pinecone
6
+ from app.groq_llm import query_groq_llm
7
+
8
+ import uuid
9
+ from dotenv import load_dotenv
10
+ import logging
11
+
12
+ load_dotenv()
13
+ app = FastAPI()
14
+
15
+ logging.basicConfig(level=logging.INFO)
16
+
17
@app.post("/run")
async def run_query(file: UploadFile = File(...), question: str = Form(...)):
    """PDF question-answering endpoint.

    Pipeline: parse the uploaded PDF -> chunk the text -> embed/store the
    chunks in Pinecone under a fresh UUID -> retrieve the best-matching
    chunks for *question* -> ask the Groq LLM using the top two as context.
    Returns HTTP 400 for unusable input and HTTP 500 on unexpected failure.
    """

    def _bad_request(message):
        # Uniform 400 response for client-side input problems.
        return JSONResponse(content={"error": message}, status_code=400)

    try:
        logging.info("📥 Received file and question: %s", question)

        pdf_bytes = await file.read()
        raw_text = extract_text_from_pdf(pdf_bytes)
        logging.info("📝 Extracted %d characters of text", len(raw_text))

        if not raw_text.strip():
            return _bad_request("No extractable text found in PDF.")

        chunks = chunk_text(raw_text)
        logging.info("✂️ Generated %d chunks", len(chunks))

        if not chunks:
            return _bad_request("Failed to generate any chunks from text.")

        file_id = str(uuid.uuid4())
        store_chunks_in_pinecone(chunks, file_id)
        logging.info("📦 Stored chunks in Pinecone with file_id: %s", file_id)

        top_chunks = query_chunks_from_pinecone(question)
        logging.info("🔍 Retrieved %d top matching chunks", len(top_chunks))

        if not top_chunks:
            return _bad_request("No relevant context found.")

        selected = top_chunks[:2]
        answer = query_groq_llm(" ".join(selected), question)

        return {
            "question": question,
            "context_used": selected,
            "answer": answer
        }

    except Exception as e:
        logging.exception("❌ Error during /run endpoint:")
        return JSONResponse(content={"error": str(e)}, status_code=500)
57
+
58
@app.get("/")
def read_root():
    """Liveness check: confirms the API is up and points users at /docs."""
    status_message = {"message": "✅ LLM PDF QA API is running. Visit /docs to test."}
    return status_message
parser.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pypdf import PdfReader
2
+ import io
3
+
4
def extract_text_from_pdf(file_bytes: bytes, max_pages: int = 20):
    """Return the concatenated text of the first *max_pages* pages of a PDF.

    Pages that yield no text (e.g. scanned images) are skipped; extracted
    page texts are joined with newlines.

    Args:
        file_bytes: Raw PDF file contents.
        max_pages: Upper bound on pages read, to limit memory use.
    """
    reader = PdfReader(io.BytesIO(file_bytes))
    collected = []

    for page_number, page in enumerate(reader.pages):
        if page_number >= max_pages:
            # Stop early to limit memory use on very large documents.
            break
        page_text = page.extract_text()
        if page_text:
            collected.append(page_text)

    return "\n".join(collected)
requirements.txt ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.7.0
2
+ anyio==4.9.0
3
+ certifi==2025.7.14
4
+ charset-normalizer==3.4.2
5
+ click==8.2.1
6
+ colorama==0.4.6
7
+ fastapi==0.116.1
8
+ filelock==3.18.0
9
+ fsspec==2025.7.0
10
+ h11==0.16.0
11
+ huggingface-hub==0.34.3
12
+ idna==3.10
13
+ Jinja2==3.1.6
14
+ joblib==1.5.1
15
+ MarkupSafe==3.0.2
16
+ networkx==3.5
17
+ numpy==2.3.2
18
+ packaging==24.2
19
+ pinecone==7.3.0
20
+ pydantic==2.11.7
21
+ pydantic_core==2.33.2
22
+ pypdf==5.9.0
23
+ python-dateutil==2.9.0.post0
24
+ python-dotenv==1.1.1
25
+ python-multipart==0.0.20
26
+ requests==2.32.4
27
+ scikit-learn==1.7.1
28
+ sentence-transformers==5.0.0
29
+ sniffio==1.3.1
30
+ starlette==0.47.2
31
+ threadpoolctl==3.6.0
32
+ typing-inspection==0.4.1
33
+ typing_extensions==4.14.1
34
+ urllib3==2.5.0
35
+ uvicorn==0.35.0
36
+
retriever.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from pinecone import Pinecone, ServerlessSpec
4
+ from sentence_transformers import SentenceTransformer
5
+
6
+ load_dotenv()
7
+
8
+ _index = None
9
+ _pc_client = None
10
+
11
+
12
def get_embedder():
    """Lazily construct the sentence-embedding model (384-dim MiniLM).

    Loaded on demand rather than at import time to keep startup memory low.
    Must stay a 384-dim model to match the Pinecone index dimension.

    Raises:
        RuntimeError: if the model cannot be loaded.
    """
    try:
        model = SentenceTransformer("paraphrase-MiniLM-L3-v2")  # small 384-dim model
    except Exception as e:
        raise RuntimeError(f"❌ Failed to load embedder: {e}")
    return model
18
+
19
+
20
def get_index():
    """Return a cached Pinecone index handle, creating the index on first use.

    The client and index handle are memoised in module globals so repeated
    calls reuse one connection. A serverless AWS index (384 dims, cosine —
    matching the MiniLM embedder) is created if it does not already exist.

    Raises:
        RuntimeError: if configuration is missing or Pinecone setup fails.
    """
    global _index, _pc_client

    if _index is not None:
        return _index

    try:
        index_name = os.getenv("PINECONE_INDEX_NAME")
        if not index_name:
            raise ValueError("❌ Pinecone index name not set in environment variables.")

        _pc_client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

        if index_name not in _pc_client.list_indexes().names():
            _pc_client.create_index(
                name=index_name,
                dimension=384,  # must match the sentence-transformer output size
                metric="cosine",
                spec=ServerlessSpec(
                    cloud="aws",
                    region=os.getenv("PINECONE_REGION", "us-west-2"),
                ),
            )
        _index = _pc_client.Index(index_name)
    except Exception as e:
        raise RuntimeError(f"❌ Pinecone index not ready or does not exist: {e}")

    return _index
44
+
45
+
46
def store_chunks_in_pinecone(chunks, file_id):
    """Embed each chunk and upsert it into Pinecone under ``{file_id}-{i}`` ids.

    Perf fix: the embedding model is loaded once and reused for every chunk —
    previously ``get_embedder()`` (a full SentenceTransformer model load) was
    called inside the loop, paying the load cost per chunk. Failures remain
    best-effort and are printed rather than raised: a bad chunk is skipped,
    and a setup error aborts the whole store without raising, matching the
    original contract.

    Args:
        chunks: Iterable of text chunks to index.
        file_id: Prefix used to namespace this document's vector ids.
    """
    try:
        index = get_index()
        embedder = get_embedder()  # load once, not per chunk

        for i, chunk in enumerate(chunks):
            try:
                vec = embedder.encode(chunk).tolist()

                # Upsert each vector immediately to avoid memory buildup
                index.upsert(vectors=[{
                    "id": f"{file_id}-{i}",
                    "values": vec,
                    "metadata": {"text": chunk}
                }])
            except Exception as e:
                print(f"⚠️ Skipping chunk {i} due to error: {e}")

        del embedder  # Free memory once all chunks are stored
    except Exception as e:
        print(f"❌ Initialization error: {e}")
66
+
67
+
68
def query_chunks_from_pinecone(query, top_k=3):
    """Return the stored text of the *top_k* chunks most similar to *query*.

    Embeds the query, searches the Pinecone index, and pulls each match's
    chunk text out of its metadata. On any failure the error is printed and
    an empty list is returned (best-effort contract).
    """
    try:
        index = get_index()
        embedder = get_embedder()
        query_vec = embedder.encode(query).tolist()
        del embedder  # Free memory after encoding

        response = index.query(vector=query_vec, top_k=top_k, include_metadata=True)
        matches = response.get("matches", [])
        return [match["metadata"]["text"] for match in matches]
    except Exception as e:
        print(f"❌ Query error: {e}")
        return []