Spaces:
Running
Running
Upload 8 files
Browse files- .env +5 -0
- __init__.py +0 -0
- chunker.py +31 -0
- groq_llm.py +42 -0
- main.py +60 -0
- parser.py +15 -0
- requirements.txt +36 -0
- retriever.py +79 -0
.env
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
GROQ_API_KEY=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
|
| 2 |
+
PINECONE_API_KEY=pcsk_5BuB2j_JspVPM6YSmS1FC7uUAM7mc6jkd3X9HxvWihUuJv1nkit4hwpF1rR55pSzy2Eu5g
|
| 3 |
+
PINECONE_INDEX_NAME=doc-index
|
| 4 |
+
PORT=10000
|
| 5 |
+
PINECONE_REGION=us-east-1
|
__init__.py
ADDED
|
File without changes
|
chunker.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def chunk_text(text, max_tokens=300, max_chunks=10):
    """Split *text* into word-bounded chunks of at most ``max_tokens`` words.

    Sentences (delimited by ., ! or ?) are packed greedily: a sentence that
    would overflow the current chunk starts a new one.  At most ``max_chunks``
    chunks are produced; any text beyond the cap is dropped.
    """
    import re

    pieces = re.split(r'(?<=[.!?])\s+', text)  # Better sentence splitting

    out = []
    buffer = []
    buffered = 0

    for piece in pieces:
        tokens = piece.split()
        if not tokens:
            continue
        if buffered + len(tokens) > max_tokens:
            # Current chunk is full — flush it before starting a new one.
            flushed = " ".join(buffer).strip()
            if flushed:
                out.append(flushed)
            if len(out) >= max_chunks:
                break
            buffer, buffered = tokens, len(tokens)
        else:
            buffer += tokens
            buffered += len(tokens)

    # Flush whatever is left, unless the cap was already reached.
    if buffer and len(out) < max_chunks:
        flushed = " ".join(buffer).strip()
        if flushed:
            out.append(flushed)

    return out
|
groq_llm.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
+
def truncate_context(context, max_words=800):
    """Clip *context* to its first ``max_words`` whitespace-separated words."""
    clipped = context.split()[:max_words]
    return " ".join(clipped)
|
| 10 |
+
|
| 11 |
+
def query_groq_llm(context, question):
    """Answer *question* from *context* via Groq's OpenAI-compatible chat API.

    Returns the model's reply as a stripped string, or a
    "GROQ LLM error: ..." string on any failure (missing API key, HTTP
    error, timeout, malformed response) — callers never see an exception.
    """
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        return "GROQ LLM error: GROQ_API_KEY is not set in environment variables"

    # Keep the prompt bounded so we stay within the model's context window.
    context = truncate_context(context)

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    data = {
        "model": "llama3-8b-8192",  # Smaller model
        "messages": [
            {"role": "system", "content": "You are an intelligent assistant."},
            {"role": "user", "content": f"Use the following context to answer the question.\n\nContext:\n{context}\n\nQuestion:\n{question}"}
        ],
        "temperature": 0.3,  # Reduce hallucination & memory use
        "max_tokens": 150  # Lowered to limit output size
    }

    try:
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=data,
            # BUGFIX: no timeout meant a stalled connection could hang the
            # request handler forever; fail after 30s instead.
            timeout=30,
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"GROQ LLM error: {str(e)}"
|
main.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, UploadFile, Form, File
|
| 2 |
+
from fastapi.responses import JSONResponse
|
| 3 |
+
from app.parser import extract_text_from_pdf
|
| 4 |
+
from app.chunker import chunk_text
|
| 5 |
+
from app.retriever import store_chunks_in_pinecone, query_chunks_from_pinecone
|
| 6 |
+
from app.groq_llm import query_groq_llm
|
| 7 |
+
|
| 8 |
+
import uuid
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
load_dotenv()
|
| 13 |
+
app = FastAPI()
|
| 14 |
+
|
| 15 |
+
logging.basicConfig(level=logging.INFO)
|
| 16 |
+
|
| 17 |
+
@app.post("/run")
async def run_query(file: UploadFile = File(...), question: str = Form(...)):
    """Answer *question* about the uploaded PDF via the RAG pipeline.

    Steps: extract text, chunk it, embed + store the chunks, retrieve the
    best-matching chunks for the question, and ask the LLM with that context.
    """
    try:
        logging.info("📥 Received file and question: %s", question)

        pdf_bytes = await file.read()
        document_text = extract_text_from_pdf(pdf_bytes)
        logging.info("📝 Extracted %d characters of text", len(document_text))

        if not document_text.strip():
            return JSONResponse(content={"error": "No extractable text found in PDF."}, status_code=400)

        pieces = chunk_text(document_text)
        logging.info("✂️ Generated %d chunks", len(pieces))

        if not pieces:
            return JSONResponse(content={"error": "Failed to generate any chunks from text."}, status_code=400)

        doc_id = str(uuid.uuid4())
        store_chunks_in_pinecone(pieces, doc_id)
        logging.info("📦 Stored chunks in Pinecone with file_id: %s", doc_id)

        # NOTE(review): retrieval is not filtered by file_id, so matches may
        # come from previously uploaded documents — confirm this is intended.
        matches = query_chunks_from_pinecone(question)
        logging.info("🔍 Retrieved %d top matching chunks", len(matches))

        if not matches:
            return JSONResponse(content={"error": "No relevant context found."}, status_code=400)

        answer = query_groq_llm(" ".join(matches[:2]), question)

        return {
            "question": question,
            "context_used": matches[:2],
            "answer": answer,
        }

    except Exception as e:
        logging.exception("❌ Error during /run endpoint:")
        return JSONResponse(content={"error": str(e)}, status_code=500)
|
| 57 |
+
|
| 58 |
+
@app.get("/")
def read_root():
    """Health-check endpoint pointing users at the interactive docs."""
    status = "✅ LLM PDF QA API is running. Visit /docs to test."
    return {"message": status}
|
parser.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pypdf import PdfReader
|
| 2 |
+
import io
|
| 3 |
+
|
| 4 |
+
def extract_text_from_pdf(file_bytes: bytes, max_pages: int = 20):
    """Extract text from the first ``max_pages`` pages of a PDF given as bytes.

    Pages yielding no text are skipped; the extracted page texts are joined
    with newlines.
    """
    document = PdfReader(io.BytesIO(file_bytes))
    pages_text = []

    for page_number, page in enumerate(document.pages):
        if page_number >= max_pages:
            break  # Stop early to limit memory use
        extracted = page.extract_text()
        if extracted:
            pages_text.append(extracted)

    return "\n".join(pages_text)
|
requirements.txt
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
annotated-types==0.7.0
|
| 2 |
+
anyio==4.9.0
|
| 3 |
+
certifi==2025.7.14
|
| 4 |
+
charset-normalizer==3.4.2
|
| 5 |
+
click==8.2.1
|
| 6 |
+
colorama==0.4.6
|
| 7 |
+
fastapi==0.116.1
|
| 8 |
+
filelock==3.18.0
|
| 9 |
+
fsspec==2025.7.0
|
| 10 |
+
h11==0.16.0
|
| 11 |
+
huggingface-hub==0.34.3
|
| 12 |
+
idna==3.10
|
| 13 |
+
Jinja2==3.1.6
|
| 14 |
+
joblib==1.5.1
|
| 15 |
+
MarkupSafe==3.0.2
|
| 16 |
+
networkx==3.5
|
| 17 |
+
numpy==2.3.2
|
| 18 |
+
packaging==24.2
|
| 19 |
+
pinecone==7.3.0
|
| 20 |
+
pydantic==2.11.7
|
| 21 |
+
pydantic_core==2.33.2
|
| 22 |
+
pypdf==5.9.0
|
| 23 |
+
python-dateutil==2.9.0.post0
|
| 24 |
+
python-dotenv==1.1.1
|
| 25 |
+
python-multipart==0.0.20
|
| 26 |
+
requests==2.32.4
|
| 27 |
+
scikit-learn==1.7.1
|
| 28 |
+
sentence-transformers==5.0.0
|
| 29 |
+
sniffio==1.3.1
|
| 30 |
+
starlette==0.47.2
|
| 31 |
+
threadpoolctl==3.6.0
|
| 32 |
+
typing-inspection==0.4.1
|
| 33 |
+
typing_extensions==4.14.1
|
| 34 |
+
urllib3==2.5.0
|
| 35 |
+
uvicorn==0.35.0
|
| 36 |
+
|
retriever.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from pinecone import Pinecone, ServerlessSpec
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
|
| 6 |
+
load_dotenv()
|
| 7 |
+
|
| 8 |
+
_index = None
|
| 9 |
+
_pc_client = None
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def get_embedder():
    """Lazily instantiate the sentence-embedding model (small 384-dim MiniLM)."""
    try:
        model = SentenceTransformer("paraphrase-MiniLM-L3-v2")
    except Exception as e:
        raise RuntimeError(f"❌ Failed to load embedder: {e}")
    return model
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_index():
    """Return the Pinecone index handle, initializing it lazily on first call.

    The client and index are cached in the module globals ``_pc_client`` and
    ``_index`` so later calls are cheap.  NOTE(review): this lazy init is not
    guarded by a lock, so concurrent first calls could initialize twice —
    confirm single-threaded startup.
    """
    global _index, _pc_client
    if _index is None:
        try:
            # Index name is required; fail fast when the env var is missing.
            index_name = os.getenv("PINECONE_INDEX_NAME")
            if not index_name:
                raise ValueError("❌ Pinecone index name not set in environment variables.")

            _pc_client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

            # Create the serverless index on first run if it does not exist.
            # dimension=384 presumably matches the MiniLM embedder's output
            # size (see get_embedder) — confirm if the model ever changes.
            if index_name not in _pc_client.list_indexes().names():
                _pc_client.create_index(
                    name=index_name,
                    dimension=384,
                    metric="cosine",
                    spec=ServerlessSpec(
                        cloud="aws",
                        region=os.getenv("PINECONE_REGION", "us-west-2")
                    )
                )
            _index = _pc_client.Index(index_name)
        except Exception as e:
            # Any failure above (including the ValueError) is surfaced to
            # callers as a single RuntimeError.
            raise RuntimeError(f"❌ Pinecone index not ready or does not exist: {e}")
    return _index
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def store_chunks_in_pinecone(chunks, file_id):
    """Embed each chunk and upsert it into the Pinecone index.

    Vector ids are ``"<file_id>-<i>"`` and the chunk text (plus its file_id)
    is stored as metadata.  A failure on one chunk is logged and skipped;
    a failure to initialize the index or embedder aborts the whole call.
    """
    try:
        index = get_index()
        # BUGFIX: the embedder was re-created (a full model load) inside the
        # loop for every chunk and deleted again each iteration — load it
        # once and reuse it for all encodes.
        embedder = get_embedder()
        for i, chunk in enumerate(chunks):
            try:
                vec = embedder.encode(chunk).tolist()

                # Upsert each vector immediately to avoid memory buildup.
                # file_id is stored in metadata so future queries can filter
                # results to a single uploaded document.
                index.upsert(vectors=[{
                    "id": f"{file_id}-{i}",
                    "values": vec,
                    "metadata": {"text": chunk, "file_id": file_id}
                }])
            except Exception as e:
                print(f"⚠️ Skipping chunk {i} due to error: {e}")
        del embedder  # Free memory once all chunks are processed
    except Exception as e:
        print(f"❌ Initialization error: {e}")
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def query_chunks_from_pinecone(query, top_k=3):
    """Embed *query* and return the text of its ``top_k`` nearest chunks.

    Returns an empty list on any failure instead of raising.
    """
    try:
        pinecone_index = get_index()
        model = get_embedder()
        embedding = model.encode(query).tolist()
        del model  # Free memory after encoding

        response = pinecone_index.query(
            vector=embedding,
            top_k=top_k,
            include_metadata=True,
        )
        matches = response.get("matches", [])
        return [hit["metadata"]["text"] for hit in matches]
    except Exception as e:
        print(f"❌ Query error: {e}")
        return []
|