Dpshkh committed on
Commit
beba6d9
·
verified ·
1 Parent(s): 655c9b5

Upload 8 files

Browse files
Files changed (8) hide show
  1. .env +5 -0
  2. __init__.py +0 -0
  3. chunker.py +31 -0
  4. groq_llm.py +42 -0
  5. main.py +60 -0
  6. parser.py +15 -0
  7. requirements.txt +36 -0
  8. retriever.py +79 -0
.env ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # SECURITY(review): real API keys are committed to version control here — rotate these credentials and supply them via deployment environment variables instead of checking them in
+ GROQ_API_KEY=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
2
+ PINECONE_API_KEY=pcsk_5BuB2j_JspVPM6YSmS1FC7uUAM7mc6jkd3X9HxvWihUuJv1nkit4hwpF1rR55pSzy2Eu5g
3
+ PINECONE_INDEX_NAME=doc-index
4
+ PORT=10000
5
+ PINECONE_REGION=us-east-1
__init__.py ADDED
File without changes
chunker.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def chunk_text(text, max_tokens=300, max_chunks=10):
    """Split *text* into sentence-aligned chunks of at most *max_tokens* words.

    Sentences are kept together where possible. Bug fix: a single sentence
    longer than *max_tokens* words is now sliced on word boundaries so no
    chunk can ever exceed the limit (previously such a sentence was emitted
    as one oversized chunk). At most *max_chunks* chunks are produced; text
    beyond that cap is dropped, as before.

    Args:
        text: Raw input text.
        max_tokens: Maximum number of whitespace-separated words per chunk.
        max_chunks: Hard cap on the number of chunks returned.

    Returns:
        List of non-empty chunk strings (empty list for blank input).
    """
    import re
    # Split on whitespace that follows sentence-ending punctuation.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_len = 0

    def _flush():
        # Emit the accumulated words as one chunk, skipping empties.
        nonlocal current_chunk, current_len
        joined = " ".join(current_chunk).strip()
        if joined:
            chunks.append(joined)
        current_chunk = []
        current_len = 0

    for sentence in sentences:
        words = sentence.split()
        if not words:
            continue
        # Slice oversized sentences into max_tokens-word pieces so the
        # per-chunk word limit always holds.
        for start in range(0, len(words), max_tokens):
            piece = words[start:start + max_tokens]
            if current_len + len(piece) > max_tokens:
                _flush()
                if len(chunks) >= max_chunks:
                    return chunks
            current_chunk.extend(piece)
            current_len += len(piece)

    # Emit the trailing partial chunk, still respecting the chunk cap.
    if current_chunk and len(chunks) < max_chunks:
        _flush()
    return chunks
groq_llm.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from dotenv import load_dotenv
4
+
5
+ load_dotenv()
6
+
7
def truncate_context(context, max_words=800):
    """Cap *context* at its first *max_words* whitespace-separated words."""
    limited = context.split()[:max_words]
    return " ".join(limited)
10
+
11
def query_groq_llm(context, question):
    """Answer *question* from *context* via Groq's OpenAI-compatible chat API.

    The context is truncated before sending to keep the prompt within the
    model's limits. Bug fix: the HTTP call now carries an explicit timeout —
    previously a stalled connection would block the caller forever.

    Args:
        context: Retrieved document text to ground the answer in.
        question: The user's question.

    Returns:
        The model's answer string, or a ``"GROQ LLM error: ..."`` string on
        any failure (missing key, HTTP error, timeout, malformed response).
    """
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        return "GROQ LLM error: GROQ_API_KEY is not set in environment variables"

    context = truncate_context(context)

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    data = {
        "model": "llama3-8b-8192",  # Smaller model
        "messages": [
            {"role": "system", "content": "You are an intelligent assistant."},
            {"role": "user", "content": f"Use the following context to answer the question.\n\nContext:\n{context}\n\nQuestion:\n{question}"}
        ],
        "temperature": 0.3,  # Reduce hallucination & memory use
        "max_tokens": 150  # Lowered to limit output size
    }

    try:
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=data,
            timeout=30  # Bug fix: without a timeout the request can hang indefinitely
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"GROQ LLM error: {str(e)}"
main.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, Form, File
2
+ from fastapi.responses import JSONResponse
3
+ from app.parser import extract_text_from_pdf
4
+ from app.chunker import chunk_text
5
+ from app.retriever import store_chunks_in_pinecone, query_chunks_from_pinecone
6
+ from app.groq_llm import query_groq_llm
7
+
8
+ import uuid
9
+ from dotenv import load_dotenv
10
+ import logging
11
+
12
+ load_dotenv()
13
+ app = FastAPI()
14
+
15
+ logging.basicConfig(level=logging.INFO)
16
+
17
@app.post("/run")
async def run_query(file: UploadFile = File(...), question: str = Form(...)):
    """PDF question-answering endpoint.

    Pipeline: parse the uploaded PDF -> chunk the text -> embed/store the
    chunks in Pinecone under a fresh UUID -> retrieve the best-matching
    chunks for *question* -> ask the Groq LLM using the top two as context.
    Returns HTTP 400 for unusable input and HTTP 500 on unexpected failure.
    """

    def _bad_request(message):
        # Uniform 400 response for client-side input problems.
        return JSONResponse(content={"error": message}, status_code=400)

    try:
        logging.info("📥 Received file and question: %s", question)

        pdf_bytes = await file.read()
        raw_text = extract_text_from_pdf(pdf_bytes)
        logging.info("📝 Extracted %d characters of text", len(raw_text))

        if not raw_text.strip():
            return _bad_request("No extractable text found in PDF.")

        chunks = chunk_text(raw_text)
        logging.info("✂️ Generated %d chunks", len(chunks))

        if not chunks:
            return _bad_request("Failed to generate any chunks from text.")

        file_id = str(uuid.uuid4())
        store_chunks_in_pinecone(chunks, file_id)
        logging.info("📦 Stored chunks in Pinecone with file_id: %s", file_id)

        top_chunks = query_chunks_from_pinecone(question)
        logging.info("🔍 Retrieved %d top matching chunks", len(top_chunks))

        if not top_chunks:
            return _bad_request("No relevant context found.")

        selected = top_chunks[:2]
        answer = query_groq_llm(" ".join(selected), question)

        return {
            "question": question,
            "context_used": selected,
            "answer": answer
        }

    except Exception as e:
        logging.exception("❌ Error during /run endpoint:")
        return JSONResponse(content={"error": str(e)}, status_code=500)
57
+
58
@app.get("/")
def read_root():
    """Liveness check: confirms the API is up and points users at /docs."""
    status_message = {"message": "✅ LLM PDF QA API is running. Visit /docs to test."}
    return status_message
parser.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pypdf import PdfReader
2
+ import io
3
+
4
def extract_text_from_pdf(file_bytes: bytes, max_pages: int = 20):
    """Return the concatenated text of the first *max_pages* pages of a PDF.

    Pages that yield no text (e.g. scanned images) are skipped; extracted
    page texts are joined with newlines.

    Args:
        file_bytes: Raw PDF file contents.
        max_pages: Upper bound on pages read, to limit memory use.
    """
    reader = PdfReader(io.BytesIO(file_bytes))
    collected = []

    for page_number, page in enumerate(reader.pages):
        if page_number >= max_pages:
            # Stop early to limit memory use on very large documents.
            break
        page_text = page.extract_text()
        if page_text:
            collected.append(page_text)

    return "\n".join(collected)
requirements.txt ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ annotated-types==0.7.0
2
+ anyio==4.9.0
3
+ certifi==2025.7.14
4
+ charset-normalizer==3.4.2
5
+ click==8.2.1
6
+ colorama==0.4.6
7
+ fastapi==0.116.1
8
+ filelock==3.18.0
9
+ fsspec==2025.7.0
10
+ h11==0.16.0
11
+ huggingface-hub==0.34.3
12
+ idna==3.10
13
+ Jinja2==3.1.6
14
+ joblib==1.5.1
15
+ MarkupSafe==3.0.2
16
+ networkx==3.5
17
+ numpy==2.3.2
18
+ packaging==24.2
19
+ pinecone==7.3.0
20
+ pydantic==2.11.7
21
+ pydantic_core==2.33.2
22
+ pypdf==5.9.0
23
+ python-dateutil==2.9.0.post0
24
+ python-dotenv==1.1.1
25
+ python-multipart==0.0.20
26
+ requests==2.32.4
27
+ scikit-learn==1.7.1
28
+ sentence-transformers==5.0.0
29
+ sniffio==1.3.1
30
+ starlette==0.47.2
31
+ threadpoolctl==3.6.0
32
+ typing-inspection==0.4.1
33
+ typing_extensions==4.14.1
34
+ urllib3==2.5.0
35
+ uvicorn==0.35.0
36
+
retriever.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from pinecone import Pinecone, ServerlessSpec
4
+ from sentence_transformers import SentenceTransformer
5
+
6
+ load_dotenv()
7
+
8
+ _index = None
9
+ _pc_client = None
10
+
11
+
12
def get_embedder():
    """Lazily construct the sentence-embedding model (384-dim MiniLM).

    Loaded on demand rather than at import time to keep startup memory low.
    Must stay a 384-dim model to match the Pinecone index dimension.

    Raises:
        RuntimeError: if the model cannot be loaded.
    """
    try:
        model = SentenceTransformer("paraphrase-MiniLM-L3-v2")  # small 384-dim model
    except Exception as e:
        raise RuntimeError(f"❌ Failed to load embedder: {e}")
    return model
18
+
19
+
20
def get_index():
    """Return a cached Pinecone index handle, creating the index on first use.

    The client and index handle are memoised in module globals so repeated
    calls reuse one connection. A serverless AWS index (384 dims, cosine —
    matching the MiniLM embedder) is created if it does not already exist.

    Raises:
        RuntimeError: if configuration is missing or Pinecone setup fails.
    """
    global _index, _pc_client

    if _index is not None:
        return _index

    try:
        index_name = os.getenv("PINECONE_INDEX_NAME")
        if not index_name:
            raise ValueError("❌ Pinecone index name not set in environment variables.")

        _pc_client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

        if index_name not in _pc_client.list_indexes().names():
            _pc_client.create_index(
                name=index_name,
                dimension=384,  # must match the sentence-transformer output size
                metric="cosine",
                spec=ServerlessSpec(
                    cloud="aws",
                    region=os.getenv("PINECONE_REGION", "us-west-2"),
                ),
            )
        _index = _pc_client.Index(index_name)
    except Exception as e:
        raise RuntimeError(f"❌ Pinecone index not ready or does not exist: {e}")

    return _index
44
+
45
+
46
def store_chunks_in_pinecone(chunks, file_id):
    """Embed each chunk and upsert it into Pinecone under ``{file_id}-{i}`` ids.

    Perf fix: the embedding model is loaded once and reused for every chunk —
    previously ``get_embedder()`` (a full SentenceTransformer model load) was
    called inside the loop, paying the load cost per chunk. Failures remain
    best-effort and are printed rather than raised: a bad chunk is skipped,
    and a setup error aborts the whole store without raising, matching the
    original contract.

    Args:
        chunks: Iterable of text chunks to index.
        file_id: Prefix used to namespace this document's vector ids.
    """
    try:
        index = get_index()
        embedder = get_embedder()  # load once, not per chunk

        for i, chunk in enumerate(chunks):
            try:
                vec = embedder.encode(chunk).tolist()

                # Upsert each vector immediately to avoid memory buildup
                index.upsert(vectors=[{
                    "id": f"{file_id}-{i}",
                    "values": vec,
                    "metadata": {"text": chunk}
                }])
            except Exception as e:
                print(f"⚠️ Skipping chunk {i} due to error: {e}")

        del embedder  # Free memory once all chunks are stored
    except Exception as e:
        print(f"❌ Initialization error: {e}")
66
+
67
+
68
def query_chunks_from_pinecone(query, top_k=3):
    """Return the stored text of the *top_k* chunks most similar to *query*.

    Embeds the query, searches the Pinecone index, and pulls each match's
    chunk text out of its metadata. On any failure the error is printed and
    an empty list is returned (best-effort contract).
    """
    try:
        index = get_index()
        embedder = get_embedder()
        query_vec = embedder.encode(query).tolist()
        del embedder  # Free memory after encoding

        response = index.query(vector=query_vec, top_k=top_k, include_metadata=True)
        matches = response.get("matches", [])
        return [match["metadata"]["text"] for match in matches]
    except Exception as e:
        print(f"❌ Query error: {e}")
        return []