Yashnik committed on
Commit
b879c82
·
verified ·
1 Parent(s): 9120b16

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
import fitz  # PyMuPDF — PDF parsing / text extraction
from transformers import pipeline
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone
import pandas as pd
import os
import time
# huggingface-cli login

app = FastAPI()

# SECURITY FIX: the Pinecone API key was hard-coded in source (and committed).
# Read it from the environment instead; the leaked key must be rotated.
pc_api_key = os.environ.get("PINECONE_API_KEY", "")
index_name = 'project'
pc = Pinecone(api_key=pc_api_key, environment='gcp-starter', ssl_verify=False)
index = pc.Index(index_name)
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={'batch_size': 32},
)
# Process-wide flag: set True once a PDF has been ingested into the index.
pdf_processed = False
19
def extract_text_from_pdf(pdf_file):
    """Return the concatenated plain text of every page of a PDF.

    Args:
        pdf_file: binary file-like object positioned at the start of PDF data.

    Returns:
        str: all page text joined in page order.
    """
    # RESOURCE FIX: the document handle was never closed; the context manager
    # guarantees release even if text extraction raises.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
25
+
26
def chunk_text(text, chunk_size=40):
    """Split *text* into chunks of at most ``chunk_size`` whitespace-delimited words.

    Args:
        text: source string; empty/whitespace-only input yields an empty list.
        chunk_size: maximum number of words per chunk (default 40).

    Returns:
        list[str]: space-joined word chunks, in original order.
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(' '.join(words[start:start + chunk_size]))
    return chunks
29
+
30
def delete_previous_vectors():
    """Delete every vector in the 'project' namespace of the Pinecone index."""
    index.delete(delete_all=True, namespace='project')
32
+
33
def upsert_vectors(chunks, embed_model):
    """Embed text chunks and upsert them into the 'project' namespace.

    Args:
        chunks: list[str] of text chunks to index.
        embed_model: embedding model exposing ``embed_documents``.

    Vector ids are the chunk's positional index as a string; each vector
    stores its source text in metadata under the 'text' key.
    """
    # IDIOM FIX: pandas was used only to generate positional ids — plain
    # enumerate/zip does the same without the DataFrame round-trip. Also
    # materialize the vectors as a list (the original passed a one-shot
    # zip generator to the client).
    embeddings = embed_model.embed_documents(list(chunks))
    vectors = [
        (str(i), embedding, {'text': chunk})
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
    ]
    index.upsert(vectors=vectors, namespace="project")
39
+
40
def pinecone_db(question, embed_model):
    """Return the stored texts of the two chunks most similar to *question*.

    Args:
        question: natural-language query string.
        embed_model: embedding model exposing ``embed_query``.

    Returns:
        list[str]: metadata 'text' of the top-2 matches, best first.
    """
    query_vector = embed_model.embed_query(question)
    response = index.query(vector=query_vector, top_k=2, namespace='project')
    match_ids = [match['id'] for match in response['matches']]
    fetched = index.fetch(ids=match_ids, namespace='project')
    texts = []
    for vec_id in match_ids:
        texts.append(fetched['vectors'][vec_id]['metadata']['text'])
    return texts
46
+
47
@app.post("/upload_pdf/")
async def upload_pdf(file: UploadFile = File(...)):
    """Ingest an uploaded PDF: extract text, chunk, embed, upsert to Pinecone.

    Returns 200 immediately if a PDF was already processed in this process.
    After upserting, polls the index (up to ~20s) until vectors are visible,
    then sets the module-level ``pdf_processed`` flag.

    Raises:
        HTTPException: 500 if the upsert never becomes visible or any step fails.
    """
    global pdf_processed
    if pdf_processed:
        return JSONResponse(content={"message": "PDF already processed!"}, status_code=200)
    try:
        pdf_text = extract_text_from_pdf(file.file)
        chunks = chunk_text(pdf_text)

        delete_previous_vectors()
        upsert_vectors(chunks, embed_model)
        # Pinecone upserts are eventually consistent; poll until visible.
        for _ in range(10):
            time.sleep(2)
            if index.describe_index_stats()['total_vector_count'] > 0:
                pdf_processed = True
                return JSONResponse(content={"message": "PDF processed and vectors upserted!"}, status_code=200)
        raise HTTPException(status_code=500, detail="Failed to upsert vectors")
    except HTTPException:
        # BUG FIX: the broad handler below previously caught our own
        # HTTPException and re-wrapped it, replacing the intended detail
        # message with str(e). Let it propagate unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
66
+
67
@app.post("/ask_question/")
async def ask_question(question: str, model: str):
    """Answer *question* using retrieved PDF context and the selected model.

    Args:
        question: the user's question; must be non-empty.
        model: one of "Default", "deepset/roberta-base-squad2", or "llama".

    Returns:
        JSONResponse with {"answer": ...}.

    Raises:
        HTTPException: 400 for an empty question or an unsupported model name.
    """
    if not question:
        raise HTTPException(status_code=400, detail="Question cannot be empty")

    context = pinecone_db(question, embed_model)
    # NOTE(review): a pipeline is rebuilt on every request (model download /
    # load) — consider caching per model name for latency.
    if model == "Default":
        qa_pipeline = pipeline("question-answering", max_answer_length=512)
    elif model == "deepset/roberta-base-squad2":
        qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", max_answer_length=512)
    elif model == "llama":
        qa_pipeline = pipeline("text-generation", model="meta-llama/Llama-3.2-1B-Instruct")
    else:
        # BUG FIX: any other model name previously fell through with
        # qa_pipeline unbound, producing a NameError (500) below.
        raise HTTPException(status_code=400, detail=f"Unsupported model: {model}")

    if model != "llama":
        answer = qa_pipeline(question=question, context=str(context))
    else:
        prompt = f'''
<Instruction>: Answer to the following question using the provided context. Only provide Answer and explanation.
<Context>: {".".join(context)}
<Question>: {question}
<Answer>:
'''
        detailed_explanation = qa_pipeline(prompt, max_new_tokens=256)
        generated_text = detailed_explanation[0]['generated_text']
        # Keep only the text after the "<Answer>:" marker in the generation.
        answer = generated_text[generated_text.find("<Answer>:") + len("<Answer>:"):].strip()
    return JSONResponse(content={"answer": answer}, status_code=200)