Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Standard library
import os
import time

# Third-party
import fitz  # PyMuPDF
import pandas as pd
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone
from transformers import pipeline

# huggingface-cli login
app = FastAPI()

# SECURITY FIX: the Pinecone API key was hard-coded in source. Read it from
# the environment; the old literal remains only as a fallback so existing
# deployments keep working — rotate that key and delete the fallback.
pc_api_key = os.environ.get(
    "PINECONE_API_KEY", "6aa08e79-86e2-4049-b16e-67e4cdafc78f"
)
index_name = 'project'
# NOTE(review): ssl_verify=False disables TLS certificate verification —
# confirm this is intentional; it weakens transport security.
pc = Pinecone(api_key=pc_api_key, environment='gcp-starter', ssl_verify=False)
index = pc.Index(index_name)
# Sentence-transformer embedder shared by the upsert and query paths.
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={'batch_size': 32},
)
# Process-local flag: True once a PDF has been chunked and upserted.
pdf_processed = False
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        pdf_file: Binary file-like object positioned at the start of the PDF.

    Returns:
        str: All page text joined in document order.
    """
    # BUG FIX: the original never closed the fitz document, leaking the
    # underlying buffer. fitz.Document supports the context-manager protocol.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
def chunk_text(text, chunk_size=40):
    """Split *text* into whitespace-delimited chunks of at most *chunk_size* words.

    Args:
        text: Source string; any run of whitespace separates words.
        chunk_size: Maximum number of words per chunk (default 40).

    Returns:
        list[str]: Chunks in original order; empty list for empty input.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), chunk_size):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
    return chunks
def delete_previous_vectors():
    """Wipe all vectors in the 'project' namespace of the module-level index.

    Called before upserting a new PDF so the fresh document replaces —
    rather than mixes with — the previous document's chunks.
    """
    index.delete(delete_all=True, namespace='project')
def upsert_vectors(chunks, embed_model):
    """Embed text chunks and upsert them into the 'project' namespace.

    Args:
        chunks: Sequence of text chunks; list position becomes the vector id.
        embed_model: Embedding model exposing ``embed_documents``.
    """
    # The original built a pandas DataFrame only to recover positional ids and
    # the same list of strings — plain list operations do the identical work.
    texts = list(chunks)
    ids = [str(i) for i in range(len(texts))]
    embeddings = embed_model.embed_documents(texts)
    metadata = [{'text': chunk} for chunk in texts]
    # Pinecone accepts an iterable of (id, values, metadata) tuples.
    index.upsert(vectors=list(zip(ids, embeddings, metadata)), namespace="project")
def pinecone_db(question, embed_model):
    """Return the texts of the two stored chunks most similar to *question*.

    Args:
        question: Natural-language query string.
        embed_model: Embedding model exposing ``embed_query``.

    Returns:
        list[str]: Chunk texts ordered by descending similarity score.
    """
    query_vector = embed_model.embed_query(question)
    # PERF FIX: include_metadata=True returns each match's stored metadata
    # in the query response itself, removing the separate index.fetch()
    # round-trip the original made to recover the chunk text.
    response = index.query(
        vector=query_vector, top_k=2, namespace='project', include_metadata=True
    )
    return [match['metadata']['text'] for match in response['matches']]
@app.post("/upload_pdf/")
async def upload_pdf(file: UploadFile = File(...)):
    """Extract, chunk, embed and upsert an uploaded PDF (at most once per process).

    Returns:
        200 with a status message when processing succeeds or was already done.

    Raises:
        HTTPException: 500 when extraction/upsert fails, or when the index
            never reports the new vectors within the polling window.
    """
    global pdf_processed
    if pdf_processed:
        return JSONResponse(content={"message": "PDF already processed!"}, status_code=200)
    try:
        pdf_text = extract_text_from_pdf(file.file)
        chunks = chunk_text(pdf_text)

        delete_previous_vectors()
        upsert_vectors(chunks, embed_model)
        # Pinecone upserts are eventually consistent: poll the index stats
        # for up to ~20 s until the new vectors become visible.
        for _ in range(10):
            time.sleep(2)
            if index.describe_index_stats()['total_vector_count'] > 0:
                pdf_processed = True
                return JSONResponse(
                    content={"message": "PDF processed and vectors upserted!"},
                    status_code=200,
                )
        raise HTTPException(status_code=500, detail="Failed to upsert vectors")
    except HTTPException:
        # BUG FIX: the blanket handler below used to catch the deliberate
        # 500 raised just above and re-wrap it, mangling its detail string.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/ask_question/")
async def ask_question(question: str, model: str):
    """Answer *question* using the stored PDF context and the chosen model.

    Args:
        question: The user's question; must be non-empty.
        model: One of "Default", "deepset/roberta-base-squad2", or "llama".

    Returns:
        200 with ``{"answer": ...}``.

    Raises:
        HTTPException: 400 for an empty question or an unknown model name.
    """
    if not question:
        raise HTTPException(status_code=400, detail="Question cannot be empty")

    context = pinecone_db(question, embed_model)
    # print(context)
    if model == "Default":
        qa_pipeline = pipeline("question-answering", max_answer_length=512)
    elif model == "deepset/roberta-base-squad2":
        qa_pipeline = pipeline(
            "question-answering", model="deepset/roberta-base-squad2", max_answer_length=512
        )
    elif model == "llama":
        qa_pipeline = pipeline("text-generation", model="meta-llama/Llama-3.2-1B-Instruct")
    else:
        # BUG FIX: an unrecognized model previously left qa_pipeline unbound,
        # crashing below with a NameError that surfaced as an opaque 500.
        raise HTTPException(status_code=400, detail=f"Unknown model: {model}")

    if model != "llama":
        answer = qa_pipeline(question=question, context=str(context))
    else:
        prompt = f'''
<Instruction>: Answer to the following question using the provided context. Only provide Answer and explanation.
<Context>: {".".join(context)}
<Question>: {question}
<Answer>:
'''
        detailed_explanation = qa_pipeline(prompt, max_new_tokens=256)
        generated_text = detailed_explanation[0]['generated_text']
        # Keep only what the model produced after the <Answer>: marker.
        answer = generated_text[generated_text.find("<Answer>:") + len("<Answer>:"):].strip()
    return JSONResponse(content={"answer": answer}, status_code=200)