import asyncio
import os
import time

import fitz  # PyMuPDF
import pandas as pd
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from langchain.embeddings.huggingface import HuggingFaceEmbeddings  # langchain_community.embeddings in newer LangChain releases
from pinecone import Pinecone
from transformers import pipeline
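
# Assumed dependency set (the original pins no versions): fastapi, uvicorn,
# pymupdf, transformers (plus torch), langchain, sentence-transformers,
# pinecone-client >= 3, pandas.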
app = FastAPI()

# Read the Pinecone API key from the environment rather than hardcoding a secret.
pc_api_key = os.environ["PINECONE_API_KEY"]
index_name = 'project'

# The v3 Pinecone client needs only the API key; the legacy 'environment'
# argument does not apply, and TLS verification is left enabled.
pc = Pinecone(api_key=pc_api_key)
index = pc.Index(index_name)

embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={'batch_size': 32},
)

# Process-wide flag: once a PDF has been ingested, later uploads are no-ops
# until the server restarts.
pdf_processed = False


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in an uploaded PDF."""
    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    text = ""
    for page in doc:
        text += page.get_text()
    return text


def chunk_text(text, chunk_size=40):
    """Split text into chunks of roughly chunk_size whitespace-delimited words."""
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
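
# Note: word-count chunking is a simple heuristic. all-MiniLM-L6-v2 truncates
# input at 256 word-piece tokens, so 40-word chunks fit comfortably within its
# limit; a token-aware splitter would be needed for substantially larger chunks.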


def delete_previous_vectors():
    """Clear the 'project' namespace so each upload replaces the previous document."""
    index.delete(delete_all=True, namespace='project')


def upsert_vectors(chunks, embed_model):
    """Embed each chunk and upsert (id, vector, metadata) triples into Pinecone."""
    df = pd.DataFrame(chunks, columns=['content'])
    ids = df.index.astype(str).tolist()
    embeddings = embed_model.embed_documents(df['content'].tolist())
    metadata = [{'text': chunk} for chunk in df['content']]
    # Materialise the generator: the client expects a sequence of
    # (id, values, metadata) tuples, not a one-shot zip iterator.
    index.upsert(vectors=list(zip(ids, embeddings, metadata)), namespace="project")
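
# For large PDFs a single upsert call can exceed Pinecone's per-request
# limits; passing batch_size (e.g. index.upsert(..., batch_size=100)) makes
# the client split the payload into smaller requests.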


def pinecone_db(question, embed_model):
    """Embed the question and return the text of the top-k most similar chunks."""
    ques_x = embed_model.embed_query(question)
    # include_metadata=True returns each match's stored chunk text directly,
    # which avoids a second fetch() round trip.
    similar_content = index.query(vector=ques_x, top_k=2, namespace='project', include_metadata=True)
    return [match['metadata']['text'] for match in similar_content['matches']]
@app.post("/upload_pdf/") |
|
|
async def upload_pdf(file: UploadFile = File(...)): |
|
|
global pdf_processed |
|
|
if pdf_processed: |
|
|
return JSONResponse(content={"message": "PDF already processed!"}, status_code=200) |
|
|
try: |
|
|
pdf_text = extract_text_from_pdf(file.file) |
|
|
chunks = chunk_text(pdf_text) |
|
|
|
|
|
delete_previous_vectors() |
|
|
upsert_vectors(chunks, embed_model) |
|
|
for _ in range(10): |
|
|
time.sleep(2) |
|
|
if index.describe_index_stats()['total_vector_count'] > 0: |
|
|
pdf_processed = True |
|
|
return JSONResponse(content={"message": "PDF processed and vectors upserted!"}, status_code=200) |
|
|
raise HTTPException(status_code=500, detail="Failed to upsert vectors") |
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=str(e)) |
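
# Example request, assuming the server runs locally on port 8000:
#
#   curl -X POST "http://localhost:8000/upload_pdf/" -F "file=@document.pdf"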
@app.post("/ask_question/") |
|
|
async def ask_question(question: str, model: str): |
|
|
if not question: |
|
|
raise HTTPException(status_code=400, detail="Question cannot be empty") |
|
|
|
|
|
context = pinecone_db(question, embed_model) |
|
|
|
|
|
if model == "Default": |
|
|
qa_pipeline = pipeline("question-answering", max_answer_length=512) |
|
|
elif model == "deepset/roberta-base-squad2": |
|
|
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2", max_answer_length=512) |
|
|
elif model == "llama": |
|
|
qa_pipeline = pipeline("text-generation", model="meta-llama/Llama-3.2-1B-Instruct") |
|
|
|
|
|
if model != "llama": |
|
|
answer = qa_pipeline(question=question, context=str(context)) |
|
|
else: |
|
|
prompt = f''' |
|
|
<Instruction>: Answer to the following question using the provided context. Only provide Answer and explanation. |
|
|
<Context>: {".".join(context)} |
|
|
<Question>: {question} |
|
|
<Answer>: |
|
|
''' |
|
|
detailed_explanation = qa_pipeline(prompt, max_new_tokens=256) |
|
|
generated_text = detailed_explanation[0]['generated_text'] |
|
|
answer = generated_text[generated_text.find("<Answer>:") + len("<Answer>:"):].strip() |
|
|
return JSONResponse(content={"answer": answer}, status_code=200) |
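
# Example request (question and model are query parameters; model must be
# "Default", "deepset/roberta-base-squad2", or "llama"):
#
#   curl -X POST "http://localhost:8000/ask_question/?question=What%20is%20this%20about%3F&model=Default"

# Minimal dev entry point, assuming uvicorn is installed; equivalently, run
# `uvicorn main:app --reload` (module name assumed).
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)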