# Author: iaravagni
# Last change: chunk size modification
# Commit: 6c1417f
import csv
import os
import re

import google.generativeai as genai
import gradio as gr
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
# Configure the Gemini API key from the environment rather than hard-coding it.
# SECURITY: the previous revision committed a literal API key to source control;
# that key is compromised and must be rotated. Set GOOGLE_API_KEY before running.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))
def extract_text_data(path):
    """Extract the text of every page of a PDF file.

    Parameters
    ----------
    path : str
        Path to the PDF file.

    Returns
    -------
    str
        The concatenated text of all pages.
    """
    reader = PdfReader(path)
    # extract_text() returns None for pages with no text layer (e.g. scanned
    # images); coerce to "" so the concatenation never raises TypeError.
    # "".join also avoids the quadratic cost of repeated `text +=`.
    return "".join(page.extract_text() or "" for page in reader.pages)
def clean_text(text):
    """Normalize PDF-extraction artifacts in *text*.

    Doubled paragraph separators become newlines, lone separators become
    spaces, unicode hyphens become ASCII hyphens, and escaped apostrophes
    are unescaped. Order matters: the doubled separator must be replaced
    before the single one.
    """
    replacements = (
        ("\u2029\u2029", "\n"),
        ("\u2029", " "),
        ("\u2010", "-"),
        (r"\'", "'"),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text
def chunk_text(text, chunk_size=500, overlap=100):
    """Split text into overlapping, word-aligned chunks.

    Parameters
    ----------
    text : str
        Raw text; preprocessed with clean_text() before splitting.
    chunk_size : int
        Maximum number of words per chunk. Must be positive.
    overlap : int
        Number of words shared between consecutive chunks.
        Must be smaller than ``chunk_size``.

    Returns
    -------
    list[str]
        Word chunks; empty list for empty input.

    Raises
    ------
    ValueError
        If ``chunk_size <= 0`` or ``overlap >= chunk_size`` — either would
        make the chunking step non-positive and loop forever in the
        original implementation.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    # Split on whitespace so chunks never break in the middle of a word.
    words = clean_text(text).split()
    step = chunk_size - overlap  # guaranteed > 0 by the checks above
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]
def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
    """Encode a list of text chunks into dense embedding vectors.

    Parameters
    ----------
    chunks : list[str]
        Texts to embed.
    model_name : str
        SentenceTransformer model identifier.

    Returns
    -------
    numpy.ndarray
        One embedding row per input chunk.
    """
    # Cache loaded models on the function object: constructing a
    # SentenceTransformer reloads the model weights on every call, and
    # pipeline() invokes this function twice per query (document + query).
    cache = getattr(generate_embeddings, "_model_cache", None)
    if cache is None:
        cache = generate_embeddings._model_cache = {}
    if model_name not in cache:
        cache[model_name] = SentenceTransformer(model_name)
    return cache[model_name].encode(chunks)
def store_in_database(chunks, embeddings):
    """Persist chunk texts and embeddings to ``embeddings.csv``.

    Each row holds the chunk text and its embedding serialized as a
    comma-separated string of floats. Overwrites any existing file.
    """
    with open("embeddings.csv", "w", newline="", encoding="utf-8") as out:
        writer = csv.writer(out)
        writer.writerow(["text", "embedding"])
        # Serialize each embedding as "v1,v2,..." alongside its chunk text.
        rows = (
            [text, ",".join(map(str, np.array(vector)))]
            for text, vector in zip(chunks, embeddings)
        )
        writer.writerows(rows)
    return
def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product divided by the product of the norms;
    1.0 for parallel vectors, 0.0 for orthogonal ones.
    """
    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return np.dot(vector1, vector2) / denominator
def load_from_database(filepath):
    """Load chunk texts and embeddings from a CSV written by store_in_database.

    Parameters
    ----------
    filepath : str
        Path to a CSV with a header row, then one row per chunk:
        ``[text, "v1,v2,..."]``.

    Returns
    -------
    tuple[list[str], numpy.ndarray]
        The chunk texts and a 2-D array of their embeddings.
    """
    chunks = []
    embeddings = []
    # Read with utf-8 explicitly: store_in_database writes utf-8, and the
    # platform default (e.g. cp1252 on Windows) would corrupt or reject
    # non-ASCII chunk text.
    with open(filepath, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader)  # Skip the "text,embedding" header row
        for row in reader:
            chunks.append(row[0])
            embeddings.append(np.array([float(v) for v in row[1].split(",")]))
    return chunks, np.array(embeddings)
def semantic_search(queryEmbedding, topK=5):
    """Return the topK stored chunks most similar to the query embedding.

    Loads the chunk/embedding store from ``embeddings.csv``, scores every
    stored embedding against the query with cosine similarity, and returns
    the chunk texts in descending similarity order.
    """
    storedChunks, storedEmbeddings = load_from_database("embeddings.csv")
    scores = [cosine_similarity(vec, queryEmbedding) for vec in storedEmbeddings]
    # Full descending ranking, truncated to the first topK positions.
    ranking = np.argsort(scores)[::-1][:topK]
    return [storedChunks[idx] for idx in ranking]
def insert_in_LMM_prompt(retrievedContext, query, model_name="gemini-1.5-flash-001"):
    """Ask the Gemini model to answer *query* grounded in *retrievedContext*.

    Builds a prompt that instructs the model to rely only on the supplied
    context, sends it to the configured Gemini model, and returns the
    generated answer text.
    """
    llm = genai.GenerativeModel(model_name)
    grounded_prompt = f"""
You are a helpful and responsible AI assistant providing professional guidance for healthcare staff.
The user has provided a knowledge base with relevant medical training materials.
Use only the retrieved context below to answer the question factually and safely.
Context:
{retrievedContext}
Question:
{query}
Answer:
"""
    reply = llm.generate_content(grounded_prompt)
    return reply.text
def pipeline(filePath, query):
    """Run the end-to-end RAG flow: index the PDF, retrieve, then answer.

    Extracts and chunks the PDF text, embeds and stores the chunks,
    embeds the query, retrieves the most similar chunks, and asks the
    LLM for an answer grounded in that context.
    """
    document_text = extract_text_data(filePath)
    document_chunks = chunk_text(document_text)
    store_in_database(document_chunks, generate_embeddings(document_chunks))
    query_vector = generate_embeddings([query])[0]
    retrieved_context = semantic_search(query_vector)
    return insert_in_LMM_prompt(retrieved_context, query)
def gradio_interface(file, question):
    """Gradio callback: run the RAG pipeline on the uploaded file's path."""
    uploaded_path = file.name
    return pipeline(uploaded_path, question)
# Create the Gradio interface
# Wires gradio_interface() to a two-input form (PDF upload + free-text
# question) and renders the model's answer as plain text.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Ask a Question")
    ],
    outputs="text",
    live=False, # Disable live updates
    title="RAG System Web App", # Title of the app
    description="Upload a PDF and ask a question to extract information from it.", # Optional description
    allow_flagging="never", # NOTE(review): deprecated in Gradio 4.x in favor of flagging_mode — confirm installed version
)
# Launch the interface
iface.launch()