Yashnik committed on
Commit
b879c82
·
verified ·
1 Parent(s): 9120b16

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
import fitz  # PyMuPDF — PDF parsing / text extraction
from transformers import pipeline
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone
import pandas as pd
import os
import time
# huggingface-cli login

app = FastAPI()

# SECURITY FIX: the Pinecone API key was hard-coded in source (and committed).
# Read it from the environment instead; the leaked key must be rotated.
pc_api_key = os.environ.get("PINECONE_API_KEY", "")
index_name = 'project'
pc = Pinecone(api_key=pc_api_key, environment='gcp-starter', ssl_verify=False)
index = pc.Index(index_name)
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={'batch_size': 32},
)
# Process-wide flag: set True once a PDF has been ingested into the index.
pdf_processed = False
19
def extract_text_from_pdf(pdf_file):
    """Return the concatenated plain text of every page of a PDF.

    Args:
        pdf_file: binary file-like object positioned at the start of PDF data.

    Returns:
        str: all page text joined in page order.
    """
    # RESOURCE FIX: the document handle was never closed; the context manager
    # guarantees release even if text extraction raises.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
25
+
26
def chunk_text(text, chunk_size=40):
    """Split *text* into chunks of at most ``chunk_size`` whitespace-delimited words.

    Args:
        text: source string; empty/whitespace-only input yields an empty list.
        chunk_size: maximum number of words per chunk (default 40).

    Returns:
        list[str]: space-joined word chunks, in original order.
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(' '.join(words[start:start + chunk_size]))
    return chunks
29
+
30
def delete_previous_vectors():
    """Delete every vector in the 'project' namespace of the Pinecone index."""
    index.delete(delete_all=True, namespace='project')
32
+
33
def upsert_vectors(chunks, embed_model):
    """Embed text chunks and upsert them into the 'project' namespace.

    Args:
        chunks: list[str] of text chunks to index.
        embed_model: embedding model exposing ``embed_documents``.

    Vector ids are the chunk's positional index as a string; each vector
    stores its source text in metadata under the 'text' key.
    """
    # IDIOM FIX: pandas was used only to generate positional ids — plain
    # enumerate/zip does the same without the DataFrame round-trip. Also
    # materialize the vectors as a list (the original passed a one-shot
    # zip generator to the client).
    embeddings = embed_model.embed_documents(list(chunks))
    vectors = [
        (str(i), embedding, {'text': chunk})
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
    ]
    index.upsert(vectors=vectors, namespace="project")
39
+
40
def pinecone_db(question, embed_model):
    """Return the stored texts of the two chunks most similar to *question*.

    Args:
        question: natural-language query string.
        embed_model: embedding model exposing ``embed_query``.

    Returns:
        list[str]: metadata 'text' of the top-2 matches, best first.
    """
    query_vector = embed_model.embed_query(question)
    response = index.query(vector=query_vector, top_k=2, namespace='project')
    match_ids = [match['id'] for match in response['matches']]
    fetched = index.fetch(ids=match_ids, namespace='project')
    texts = []
    for vec_id in match_ids:
        texts.append(fetched['vectors'][vec_id]['metadata']['text'])
    return texts
46
+
47
@app.post("/upload_pdf/")
async def upload_pdf(file: UploadFile = File(...)):
    """Ingest an uploaded PDF: extract text, chunk, embed, upsert to Pinecone.

    Returns 200 immediately if a PDF was already processed in this process.
    After upserting, polls the index (up to ~20s) until vectors are visible,
    then sets the module-level ``pdf_processed`` flag.

    Raises:
        HTTPException: 500 if the upsert never becomes visible or any step fails.
    """
    global pdf_processed
    if pdf_processed:
        return JSONResponse(content={"message": "PDF already processed!"}, status_code=200)
    try:
        pdf_text = extract_text_from_pdf(file.file)
        chunks = chunk_text(pdf_text)

        delete_previous_vectors()
        upsert_vectors(chunks, embed_model)
        # Pinecone upserts are eventually consistent; poll until visible.
        for _ in range(10):
            time.sleep(2)
            if index.describe_index_stats()['total_vector_count'] > 0:
                pdf_processed = True
                return JSONResponse(content={"message": "PDF processed and vectors upserted!"}, status_code=200)
        raise HTTPException(status_code=500, detail="Failed to upsert vectors")
    except HTTPException:
        # BUG FIX: the broad handler below previously caught our own
        # HTTPException and re-wrapped it, replacing the intended detail
        # message with str(e). Let it propagate unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
66
+
67
@app.post("/ask_question/")
async def ask_question(question: str, model: str):
    """Answer *question* using retrieved PDF context and the selected model.

    Args:
        question: the user's question; must be non-empty.
        model: one of "Default", "deepset/roberta-base-squad2", or "llama".

    Returns:
        JSONResponse with {"answer": ...}.

    Raises:
        HTTPException: 400 for an empty question or an unsupported model name.
    """
    if not question:
        raise HTTPException(status_code=400, detail="Question cannot be empty")

    context = pinecone_db(question, embed_model)
    # NOTE(review): a pipeline is rebuilt on every request (model download /
    # load) — consider caching per model name for latency.
    if model == "Default":
        qa_pipeline = pipeline("question-answering", max_answer_length=512)
    elif model == "deepset/roberta-base-squad2":
        qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", max_answer_length=512)
    elif model == "llama":
        qa_pipeline = pipeline("text-generation", model="meta-llama/Llama-3.2-1B-Instruct")
    else:
        # BUG FIX: any other model name previously fell through with
        # qa_pipeline unbound, producing a NameError (500) below.
        raise HTTPException(status_code=400, detail=f"Unsupported model: {model}")

    if model != "llama":
        answer = qa_pipeline(question=question, context=str(context))
    else:
        prompt = f'''
<Instruction>: Answer to the following question using the provided context. Only provide Answer and explanation.
<Context>: {".".join(context)}
<Question>: {question}
<Answer>:
'''
        detailed_explanation = qa_pipeline(prompt, max_new_tokens=256)
        generated_text = detailed_explanation[0]['generated_text']
        # Keep only the text after the "<Answer>:" marker in the generation.
        answer = generated_text[generated_text.find("<Answer>:") + len("<Answer>:"):].strip()
    return JSONResponse(content={"answer": answer}, status_code=200)