Prithivi-nanda committed on
Commit
cbaaac0
·
1 Parent(s): bfb07bd

initial commit

Browse files
Files changed (6) hide show
  1. .gitignore +3 -0
  2. Dockerfile +13 -0
  3. main.py +167 -0
  4. requirements.txt +13 -0
  5. test.py +28 -0
  6. utils.py +124 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ venv/
2
+ __pycache__/
3
+ .env
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11

# Run as a non-root user (required by e.g. Hugging Face Spaces).
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install dependencies first so the layer cache survives code-only changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
# BUG FIX: the FastAPI instance lives in main.py (there is no app.py in this
# commit), so the ASGI import string must be "main:app", not "app:app".
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import shutil
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from utils import process_file,embed_text # extraction + embedding pipeline (utils.py)
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from pinecone import Pinecone
from dotenv import load_dotenv
import requests
# Load .env before any os.getenv() below.
load_dotenv()

app = FastAPI(title="Document Embedding Uploader")
# Pinecone vector store; index name falls back to the default also used by
# utils.py and test.py.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX = os.getenv("PINECONE_INDEX") or "studybuddy-notes"
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX)
# Groq chat-completions endpoint used by /query/ and /generate-mindmap/.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GROQ_BASE_URL = "https://api.groq.com/openai/v1/chat/completions"
HEADERS = {
    "Authorization": f"Bearer {GROQ_API_KEY}",
    "Content-Type": "application/json"
}


# CORS middleware (optional, for testing with frontend)
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# wide open — tighten before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Uploaded documents are written here before being processed/embedded.
UPLOAD_FOLDER = "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Supported file types and their extensions
ALLOWED_EXTENSIONS = {
    "pdf": "pdf",
    "docx": "docx",
    "txt": "txt",
    "md": "md",
}
45
+
46
def get_file_type(filename: str):
    """Return the normalized file extension if supported, else None.

    Uses os.path.splitext so a bare filename with no dot (e.g. "pdf") is
    rejected, instead of the whole name being mistaken for its extension
    as with filename.split(".")[-1].
    """
    ext = os.path.splitext(filename)[1].lstrip(".").lower()
    if ext in ALLOWED_EXTENSIONS.values():
        return ext
    return None
51
+
52
@app.post("/upload/")
async def upload_file(file: UploadFile = File(...)):
    """Persist an uploaded document and embed its chunks into Pinecone.

    Returns 400 for unsupported extensions, a 500 JSON body if processing
    fails, and a success message otherwise.
    """
    file_type = get_file_type(file.filename)
    if not file_type:
        raise HTTPException(status_code=400, detail="Unsupported file type")

    # SECURITY FIX: strip directory components from the client-supplied name
    # so a "../../x"-style filename cannot write outside UPLOAD_FOLDER.
    safe_name = os.path.basename(file.filename)
    file_location = os.path.join(UPLOAD_FOLDER, safe_name)
    try:
        with open(file_location, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
    finally:
        # Release the spooled temp file even if the copy above fails.
        file.file.close()

    try:
        process_file(file_location, file_type)
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})

    return {"message": f"File '{file.filename}' processed and embedded successfully"}
69
+
70
+
71
class QueryRequest(BaseModel):
    # Request body for /query/: the user's natural-language question.
    query: str
73
+
74
@app.post("/query/")
async def query_llm(req: QueryRequest):
    """RAG endpoint: retrieve similar chunks from Pinecone, then ask Groq.

    Returns {"answer": str}; any failure surfaces as HTTP 500.
    """
    try:
        # Embed the query with the same model used at ingestion time.
        query_embedding = embed_text(req.query).tolist()

        # Fetch the 5 most similar chunks; metadata carries the chunk text.
        result = index.query(vector=query_embedding, top_k=5, include_metadata=True)

        docs = [match.get("metadata", {}).get("text", "") for match in result.get("matches", []) if "metadata" in match]

        context = "\n\n".join(docs) if docs else "No relevant context found."

        prompt = (
            f"You are a helpful assistant. Use the following context to answer the question.\n\n"
            f"Context:\n{context}\n\nQuestion: {req.query}\nAnswer:"
        )

        # Call Groq LLM API.
        # ROBUSTNESS FIX: a timeout prevents this worker from hanging forever
        # if the LLM API stalls (requests has no default timeout).
        response = requests.post(
            GROQ_BASE_URL,
            headers=HEADERS,
            json={
                "model": "llama3-70b-8192",
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                "max_tokens": 512
            },
            timeout=60,
        )
        response.raise_for_status()
        answer = response.json()["choices"][0]["message"]["content"].strip()

        return {"answer": answer}

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
112
+
113
class MindMapRequest(BaseModel):
    # Request body for /generate-mindmap/: topic to turn into mind-map nodes.
    query: str
115
+
116
@app.post("/generate-mindmap/")
async def generate_mindmap(req: MindMapRequest):
    """Ask Groq to produce mind-map nodes as JSON and return them parsed.

    Raises HTTP errors with specific messages for LLM API failures and for
    non-JSON model output; anything else becomes a generic 500.
    """
    # BUG FIX: `json` used to be imported inside the `try` body, so when the
    # HTTP call failed first, evaluating `except json.JSONDecodeError` raised
    # a NameError instead of returning the intended HTTPException.
    import json

    prompt = (
        "You are a helpful assistant that creates mind map nodes from the user's query. "
        "Generate output strictly in JSON array format where each node has the following schema:\n\n"
        "{ \n"
        " id: string,\n"
        " label: string,\n"
        " children: string[],\n"
        " explanation?: string,\n"
        " metadata?: { color: string, icon: string },\n"
        " parent_id?: string\n"
        "}\n\n"
        f"User query: \"{req.query}\"\n\n"
        "Please respond ONLY with valid JSON."
    )

    try:
        # ROBUSTNESS FIX: timeout so a stalled LLM call cannot hang the worker.
        response = requests.post(
            GROQ_BASE_URL,
            headers=HEADERS,
            json={
                "model": "llama3-70b-8192",
                "messages": [
                    {"role": "system", "content": "You are an expert mind map generator."},
                    {"role": "user", "content": prompt}
                ],
                "max_tokens": 1024
            },
            timeout=60,
        )
        response.raise_for_status()
        content = response.json()["choices"][0]["message"]["content"].strip()

        # Validate JSON format by parsing (catch errors)
        mindmap_nodes = json.loads(content)

        # Optional: Validate schema here or sanitize

        return mindmap_nodes

    except requests.HTTPError as http_err:
        # `response` is always bound here: HTTPError only comes from raise_for_status().
        raise HTTPException(status_code=response.status_code, detail=f"LLM API error: {http_err}")
    except json.JSONDecodeError:
        raise HTTPException(status_code=500, detail="LLM responded with invalid JSON")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
163
+
164
+
165
@app.get("/")
def root():
    """Liveness probe: confirm the API is up and reachable."""
    return {"message": "Document embedding uploader API is running."}
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.116.1
2
+ uvicorn==0.35.0
3
+ PyPDF2==3.0.0
4
+ pdf2image==1.16.3
5
+ pytesseract==0.3.10
6
+ docx2txt==0.8.0
7
+ transformers==4.35.0
8
+ torch==2.1.0
9
+ pinecone==7.3.0
10
+ python-dotenv==1.1.1
11
+ pymupdf==1.26.4
12
+ python-multipart==0.0.20
13
+ numpy<2
test.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from pinecone import Pinecone,ServerlessSpec
from dotenv import load_dotenv

load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = "studybuddy-notes"
DIMENSION = 384  # Must match the embedding size produced by utils.embed_text

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

# BUG FIX: list_indexes() returns an IndexList of index descriptions, not a
# list of name strings, so `INDEX_NAME not in indexes` was effectively always
# True and the script tried to re-create an existing index. Compare against
# the index names instead.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
25
+
26
+
27
+
28
+
utils.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from PyPDF2 import PdfReader
3
+ import docx2txt
4
+ from pinecone import Pinecone, ServerlessSpec
5
+ from transformers import AutoTokenizer, AutoModel
6
+ import torch
7
+ from dotenv import load_dotenv
8
+
9
+ load_dotenv()
10
+ # -------- Document Text Extraction --------
11
+
12
def extract_text_from_pdf(file_path: str, use_ocr: bool = True) -> str:
    """Extract the text of every page of a PDF using PyPDF2.

    Args:
        file_path: Path to the PDF on disk.
        use_ocr: Currently unused; kept for backward compatibility.
            TODO: wire up pdf2image + pytesseract (already in requirements.txt)
            to OCR pages whose extract_text() comes back empty.

    Returns:
        Concatenated page text; "" if the file cannot be read at all.
    """
    pages = []
    try:
        reader = PdfReader(file_path)
        for page in reader.pages:
            # extract_text() may return None for image-only pages.
            pages.append(page.extract_text() or "")
    except Exception as e:
        print(f"PDF text extraction error: {e}")

    # join() avoids the quadratic cost of repeated `text += ...`.
    return "".join(pages)
22
+
23
def extract_text_from_docx(file_path: str) -> str:
    """Extract plain text from a .docx document.

    Returns "" (after logging the error) when extraction fails.
    """
    try:
        extracted = docx2txt.process(file_path)
    except Exception as e:
        print(f"DOCX extraction error: {e}")
        return ""
    return extracted
29
+
30
def extract_text_from_txt(file_path: str) -> str:
    """Read a UTF-8 text file, returning "" (after logging) on any failure."""
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            contents = handle.read()
    except Exception as e:
        print(f"TXT extraction error: {e}")
        return ""
    return contents
37
+
38
def extract_text_from_md(file_path: str) -> str:
    """Markdown files are plain text, so reuse the TXT extractor."""
    return extract_text_from_txt(file_path)
40
+
41
# -------- Hugging Face Embedding Setup --------

# Load the MiniLM sentence encoder once at import time and switch it to
# eval mode (inference only; disables dropout).
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model.eval()
46
+
47
def mean_pooling(model_output, attention_mask):
    """Average token embeddings, weighted by the attention mask.

    Padding positions (mask == 0) contribute nothing to the mean; the
    divisor is clamped so an all-padding row cannot divide by zero.
    """
    hidden = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    summed = (hidden * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts
53
+
54
def embed_text(text):
    """Encode `text` into a single L2-normalized MiniLM vector.

    Returns a 1-D numpy array (dimension matching DIMENSION below).
    """
    tokens = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**tokens)
    pooled = mean_pooling(outputs, tokens['attention_mask'])
    unit = torch.nn.functional.normalize(pooled, p=2, dim=1)
    return unit[0].cpu().numpy()
61
+
62
# -------- Pinecone Setup --------

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = "studybuddy-notes"
DIMENSION = 384  # Embedding dimension from the model

# NOTE(review): pc.Index() assumes the index already exists — it is created
# (with DIMENSION above) by test.py, not here.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)
70
+
71
+ # -------- Text Chunking --------
72
+
73
def chunk_text(text, chunk_size=500, overlap=100):
    """Split `text` into overlapping chunks.

    Each chunk is `chunk_size` characters (the last may be shorter) and
    consecutive chunks share `overlap` characters.

    Raises:
        ValueError: If `overlap` is not strictly smaller than `chunk_size`.
    """
    if overlap >= chunk_size:
        raise ValueError("Overlap must be smaller than chunk size")
    step = chunk_size - overlap
    return [text[pos:pos + chunk_size] for pos in range(0, len(text), step)]
84
+
85
+ # -------- Complete Pipeline --------
86
+
87
def process_file(file_path, file_type):
    """Extract text from `file_path`, chunk it, embed each chunk, and upsert
    the vectors into the Pinecone index.

    Args:
        file_path: Path of the document on disk.
        file_type: One of "pdf", "docx", "txt", "md".

    Raises:
        ValueError: If `file_type` is not supported.
    """
    # Dispatch table keeps the supported types in one place.
    extractors = {
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "txt": extract_text_from_txt,
        "md": extract_text_from_md,
    }
    extractor = extractors.get(file_type)
    if extractor is None:
        raise ValueError(f"Unsupported file type: {file_type}")
    text = extractor(file_path)

    chunks = chunk_text(text)
    vectors = []
    for i, chunk in enumerate(chunks):
        vector = embed_text(chunk)
        vector_id = f"{os.path.basename(file_path)}_chunk_{i}"
        # BUG FIX: store the chunk text as metadata — the /query/ endpoint and
        # retrieve_from_pinecone() both read match["metadata"]["text"], which
        # was never written before, so retrieval always came back empty.
        # Also convert the numpy embedding to a plain list for the client.
        vectors.append((vector_id, vector.tolist(), {"text": chunk}))

    # Skip the API call entirely for empty documents.
    if vectors:
        index.upsert(vectors)
107
+
108
+ #----retrieve from pinecone------
109
def retrieve_from_pinecone(query: str, top_k: int = 5):
    """Return the `top_k` most similar stored chunks for `query`.

    Returns:
        A list of dicts with keys 'id', 'score', and 'metadata'.
    """
    # CONSISTENCY FIX: convert the numpy embedding to a plain list, as the
    # /query/ endpoint in main.py already does before calling index.query.
    query_vector = embed_text(query).tolist()

    # Query Pinecone index with metadata so the chunk text comes back.
    result = index.query(vector=query_vector, top_k=top_k, include_metadata=True)

    # Parse and return results (ID, score, metadata).
    return [
        {
            'id': match['id'],
            'score': match['score'],
            'metadata': match.get('metadata', {}),
        }
        for match in result['matches']
    ]