Prithivi-nanda committed on
Commit
cbaaac0
·
1 Parent(s): bfb07bd

initial commit

Browse files
Files changed (6) hide show
  1. .gitignore +3 -0
  2. Dockerfile +13 -0
  3. main.py +167 -0
  4. requirements.txt +13 -0
  5. test.py +28 -0
  6. utils.py +124 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ venv/
2
+ __pycache__/
3
+ .env
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11

# Run as a non-root user (required by e.g. Hugging Face Spaces).
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Install dependencies first so the layer cache survives code-only changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
# BUG FIX: the FastAPI instance lives in main.py (there is no app.py in this
# commit), so the ASGI import string must be "main:app", not "app:app".
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import shutil
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from utils import process_file,embed_text # extraction + embedding pipeline (utils.py)
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from pinecone import Pinecone
from dotenv import load_dotenv
import requests
# Load .env before any os.getenv() below.
load_dotenv()

app = FastAPI(title="Document Embedding Uploader")
# Pinecone vector store; index name falls back to the default also used by
# utils.py and test.py.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX = os.getenv("PINECONE_INDEX") or "studybuddy-notes"
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX)
# Groq chat-completions endpoint used by /query/ and /generate-mindmap/.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GROQ_BASE_URL = "https://api.groq.com/openai/v1/chat/completions"
HEADERS = {
    "Authorization": f"Bearer {GROQ_API_KEY}",
    "Content-Type": "application/json"
}


# CORS middleware (optional, for testing with frontend)
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# wide open — tighten before production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Uploaded documents are written here before being processed/embedded.
UPLOAD_FOLDER = "uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Supported file types and their extensions
ALLOWED_EXTENSIONS = {
    "pdf": "pdf",
    "docx": "docx",
    "txt": "txt",
    "md": "md",
}
45
+
46
def get_file_type(filename: str):
    """Return the normalized file extension if supported, else None.

    Uses os.path.splitext so a bare filename with no dot (e.g. "pdf") is
    rejected, instead of the whole name being mistaken for its extension
    as with filename.split(".")[-1].
    """
    ext = os.path.splitext(filename)[1].lstrip(".").lower()
    if ext in ALLOWED_EXTENSIONS.values():
        return ext
    return None
51
+
52
@app.post("/upload/")
async def upload_file(file: UploadFile = File(...)):
    """Persist an uploaded document and embed its chunks into Pinecone.

    Returns 400 for unsupported extensions, a 500 JSON body if processing
    fails, and a success message otherwise.
    """
    file_type = get_file_type(file.filename)
    if not file_type:
        raise HTTPException(status_code=400, detail="Unsupported file type")

    # SECURITY FIX: strip directory components from the client-supplied name
    # so a "../../x"-style filename cannot write outside UPLOAD_FOLDER.
    safe_name = os.path.basename(file.filename)
    file_location = os.path.join(UPLOAD_FOLDER, safe_name)
    try:
        with open(file_location, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
    finally:
        # Release the spooled temp file even if the copy above fails.
        file.file.close()

    try:
        process_file(file_location, file_type)
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})

    return {"message": f"File '{file.filename}' processed and embedded successfully"}
69
+
70
+
71
class QueryRequest(BaseModel):
    # Request body for /query/: the user's natural-language question.
    query: str
73
+
74
@app.post("/query/")
async def query_llm(req: QueryRequest):
    """RAG endpoint: retrieve similar chunks from Pinecone, then ask Groq.

    Returns {"answer": str}; any failure surfaces as HTTP 500.
    """
    try:
        # Embed the query with the same model used at ingestion time.
        query_embedding = embed_text(req.query).tolist()

        # Fetch the 5 most similar chunks; metadata carries the chunk text.
        result = index.query(vector=query_embedding, top_k=5, include_metadata=True)

        docs = [match.get("metadata", {}).get("text", "") for match in result.get("matches", []) if "metadata" in match]

        context = "\n\n".join(docs) if docs else "No relevant context found."

        prompt = (
            f"You are a helpful assistant. Use the following context to answer the question.\n\n"
            f"Context:\n{context}\n\nQuestion: {req.query}\nAnswer:"
        )

        # Call Groq LLM API.
        # ROBUSTNESS FIX: a timeout prevents this worker from hanging forever
        # if the LLM API stalls (requests has no default timeout).
        response = requests.post(
            GROQ_BASE_URL,
            headers=HEADERS,
            json={
                "model": "llama3-70b-8192",
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                "max_tokens": 512
            },
            timeout=60,
        )
        response.raise_for_status()
        answer = response.json()["choices"][0]["message"]["content"].strip()

        return {"answer": answer}

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
112
+
113
class MindMapRequest(BaseModel):
    # Request body for /generate-mindmap/: topic to turn into mind-map nodes.
    query: str
115
+
116
@app.post("/generate-mindmap/")
async def generate_mindmap(req: MindMapRequest):
    """Ask Groq to produce mind-map nodes as JSON and return them parsed.

    Raises HTTP errors with specific messages for LLM API failures and for
    non-JSON model output; anything else becomes a generic 500.
    """
    # BUG FIX: `json` used to be imported inside the `try` body, so when the
    # HTTP call failed first, evaluating `except json.JSONDecodeError` raised
    # a NameError instead of returning the intended HTTPException.
    import json

    prompt = (
        "You are a helpful assistant that creates mind map nodes from the user's query. "
        "Generate output strictly in JSON array format where each node has the following schema:\n\n"
        "{ \n"
        " id: string,\n"
        " label: string,\n"
        " children: string[],\n"
        " explanation?: string,\n"
        " metadata?: { color: string, icon: string },\n"
        " parent_id?: string\n"
        "}\n\n"
        f"User query: \"{req.query}\"\n\n"
        "Please respond ONLY with valid JSON."
    )

    try:
        # ROBUSTNESS FIX: timeout so a stalled LLM call cannot hang the worker.
        response = requests.post(
            GROQ_BASE_URL,
            headers=HEADERS,
            json={
                "model": "llama3-70b-8192",
                "messages": [
                    {"role": "system", "content": "You are an expert mind map generator."},
                    {"role": "user", "content": prompt}
                ],
                "max_tokens": 1024
            },
            timeout=60,
        )
        response.raise_for_status()
        content = response.json()["choices"][0]["message"]["content"].strip()

        # Validate JSON format by parsing (catch errors)
        mindmap_nodes = json.loads(content)

        # Optional: Validate schema here or sanitize

        return mindmap_nodes

    except requests.HTTPError as http_err:
        # `response` is always bound here: HTTPError only comes from raise_for_status().
        raise HTTPException(status_code=response.status_code, detail=f"LLM API error: {http_err}")
    except json.JSONDecodeError:
        raise HTTPException(status_code=500, detail="LLM responded with invalid JSON")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
163
+
164
+
165
@app.get("/")
def root():
    """Liveness probe: confirm the API is up and reachable."""
    return {"message": "Document embedding uploader API is running."}
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.116.1
2
+ uvicorn==0.35.0
3
+ PyPDF2==3.0.0
4
+ pdf2image==1.16.3
5
+ pytesseract==0.3.10
6
+ docx2txt==0.8.0
7
+ transformers==4.35.0
8
+ torch==2.1.0
9
+ pinecone==7.3.0
10
+ python-dotenv==1.1.1
11
+ pymupdf==1.26.4
12
+ python-multipart==0.0.20
13
+ numpy<2
test.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from pinecone import Pinecone,ServerlessSpec
from dotenv import load_dotenv

load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = "studybuddy-notes"
DIMENSION = 384  # Must match the embedding size produced by utils.embed_text

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

# BUG FIX: list_indexes() returns an IndexList of index descriptions, not a
# list of name strings, so `INDEX_NAME not in indexes` was effectively always
# True and the script tried to re-create an existing index. Compare against
# the index names instead.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
25
+
26
+
27
+
28
+
utils.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from PyPDF2 import PdfReader
3
+ import docx2txt
4
+ from pinecone import Pinecone, ServerlessSpec
5
+ from transformers import AutoTokenizer, AutoModel
6
+ import torch
7
+ from dotenv import load_dotenv
8
+
9
+ load_dotenv()
10
+ # -------- Document Text Extraction --------
11
+
12
def extract_text_from_pdf(file_path: str, use_ocr: bool = True) -> str:
    """Extract the text of every page of a PDF using PyPDF2.

    Args:
        file_path: Path to the PDF on disk.
        use_ocr: Currently unused; kept for backward compatibility.
            TODO: wire up pdf2image + pytesseract (already in requirements.txt)
            to OCR pages whose extract_text() comes back empty.

    Returns:
        Concatenated page text; "" if the file cannot be read at all.
    """
    pages = []
    try:
        reader = PdfReader(file_path)
        for page in reader.pages:
            # extract_text() may return None for image-only pages.
            pages.append(page.extract_text() or "")
    except Exception as e:
        print(f"PDF text extraction error: {e}")

    # join() avoids the quadratic cost of repeated `text += ...`.
    return "".join(pages)
22
+
23
def extract_text_from_docx(file_path: str) -> str:
    """Extract plain text from a .docx document.

    Returns "" (after logging the error) when extraction fails.
    """
    try:
        extracted = docx2txt.process(file_path)
    except Exception as e:
        print(f"DOCX extraction error: {e}")
        return ""
    return extracted
29
+
30
def extract_text_from_txt(file_path: str) -> str:
    """Read a UTF-8 text file, returning "" (after logging) on any failure."""
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            contents = handle.read()
    except Exception as e:
        print(f"TXT extraction error: {e}")
        return ""
    return contents
37
+
38
def extract_text_from_md(file_path: str) -> str:
    """Markdown files are plain text, so reuse the TXT extractor."""
    return extract_text_from_txt(file_path)
40
+
41
# -------- Hugging Face Embedding Setup --------

# Load the MiniLM sentence encoder once at import time and switch it to
# eval mode (inference only; disables dropout).
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model.eval()
46
+
47
def mean_pooling(model_output, attention_mask):
    """Average token embeddings, weighted by the attention mask.

    Padding positions (mask == 0) contribute nothing to the mean; the
    divisor is clamped so an all-padding row cannot divide by zero.
    """
    hidden = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    summed = (hidden * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts
53
+
54
def embed_text(text):
    """Encode `text` into a single L2-normalized MiniLM vector.

    Returns a 1-D numpy array (dimension matching DIMENSION below).
    """
    tokens = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**tokens)
    pooled = mean_pooling(outputs, tokens['attention_mask'])
    unit = torch.nn.functional.normalize(pooled, p=2, dim=1)
    return unit[0].cpu().numpy()
61
+
62
# -------- Pinecone Setup --------

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = "studybuddy-notes"
DIMENSION = 384  # Embedding dimension from the model

# NOTE(review): pc.Index() assumes the index already exists — it is created
# (with DIMENSION above) by test.py, not here.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)
70
+
71
+ # -------- Text Chunking --------
72
+
73
def chunk_text(text, chunk_size=500, overlap=100):
    """Split `text` into overlapping chunks.

    Each chunk is `chunk_size` characters (the last may be shorter) and
    consecutive chunks share `overlap` characters.

    Raises:
        ValueError: If `overlap` is not strictly smaller than `chunk_size`.
    """
    if overlap >= chunk_size:
        raise ValueError("Overlap must be smaller than chunk size")
    step = chunk_size - overlap
    return [text[pos:pos + chunk_size] for pos in range(0, len(text), step)]
84
+
85
+ # -------- Complete Pipeline --------
86
+
87
def process_file(file_path, file_type):
    """Extract text from `file_path`, chunk it, embed each chunk, and upsert
    the vectors into the Pinecone index.

    Args:
        file_path: Path of the document on disk.
        file_type: One of "pdf", "docx", "txt", "md".

    Raises:
        ValueError: If `file_type` is not supported.
    """
    # Dispatch table keeps the supported types in one place.
    extractors = {
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "txt": extract_text_from_txt,
        "md": extract_text_from_md,
    }
    extractor = extractors.get(file_type)
    if extractor is None:
        raise ValueError(f"Unsupported file type: {file_type}")
    text = extractor(file_path)

    chunks = chunk_text(text)
    vectors = []
    for i, chunk in enumerate(chunks):
        vector = embed_text(chunk)
        vector_id = f"{os.path.basename(file_path)}_chunk_{i}"
        # BUG FIX: store the chunk text as metadata — the /query/ endpoint and
        # retrieve_from_pinecone() both read match["metadata"]["text"], which
        # was never written before, so retrieval always came back empty.
        # Also convert the numpy embedding to a plain list for the client.
        vectors.append((vector_id, vector.tolist(), {"text": chunk}))

    # Skip the API call entirely for empty documents.
    if vectors:
        index.upsert(vectors)
107
+
108
+ #----retrieve from pinecone------
109
def retrieve_from_pinecone(query: str, top_k: int = 5):
    """Return the `top_k` most similar stored chunks for `query`.

    Returns:
        A list of dicts with keys 'id', 'score', and 'metadata'.
    """
    # CONSISTENCY FIX: convert the numpy embedding to a plain list, as the
    # /query/ endpoint in main.py already does before calling index.query.
    query_vector = embed_text(query).tolist()

    # Query Pinecone index with metadata so the chunk text comes back.
    result = index.query(vector=query_vector, top_k=top_k, include_metadata=True)

    # Parse and return results (ID, score, metadata).
    return [
        {
            'id': match['id'],
            'score': match['score'],
            'metadata': match.get('metadata', {}),
        }
        for match in result['matches']
    ]