# Author: iaravagni
# Last change: chunk size modification
# Commit: 6c1417f
import csv
import os
import re

import google.generativeai as genai
import gradio as gr
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
# Configure the Gemini API key from the environment rather than hard-coding it.
# SECURITY: the previous revision committed a literal API key to source control;
# that key is compromised and must be rotated. Set GOOGLE_API_KEY before running.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))
def extract_text_data(path):
    """Extract the text of every page of a PDF file.

    Parameters
    ----------
    path : str
        Path to the PDF file.

    Returns
    -------
    str
        The concatenated text of all pages.
    """
    reader = PdfReader(path)
    # extract_text() returns None for pages with no text layer (e.g. scanned
    # images); coerce to "" so the concatenation never raises TypeError.
    # "".join also avoids the quadratic cost of repeated `text +=`.
    return "".join(page.extract_text() or "" for page in reader.pages)
def clean_text(text):
    """Normalize PDF-extraction artifacts in *text*.

    Doubled paragraph separators become newlines, lone separators become
    spaces, unicode hyphens become ASCII hyphens, and escaped apostrophes
    are unescaped. Order matters: the doubled separator must be replaced
    before the single one.
    """
    replacements = (
        ("\u2029\u2029", "\n"),
        ("\u2029", " "),
        ("\u2010", "-"),
        (r"\'", "'"),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text
def chunk_text(text, chunk_size=500, overlap=100):
    """Split text into overlapping, word-aligned chunks.

    Parameters
    ----------
    text : str
        Raw text; preprocessed with clean_text() before splitting.
    chunk_size : int
        Maximum number of words per chunk. Must be positive.
    overlap : int
        Number of words shared between consecutive chunks.
        Must be smaller than ``chunk_size``.

    Returns
    -------
    list[str]
        Word chunks; empty list for empty input.

    Raises
    ------
    ValueError
        If ``chunk_size <= 0`` or ``overlap >= chunk_size`` — either would
        make the chunking step non-positive and loop forever in the
        original implementation.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    # Split on whitespace so chunks never break in the middle of a word.
    words = clean_text(text).split()
    step = chunk_size - overlap  # guaranteed > 0 by the checks above
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]
def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
    """Encode a list of text chunks into dense embedding vectors.

    Parameters
    ----------
    chunks : list[str]
        Texts to embed.
    model_name : str
        SentenceTransformer model identifier.

    Returns
    -------
    numpy.ndarray
        One embedding row per input chunk.
    """
    # Cache loaded models on the function object: constructing a
    # SentenceTransformer reloads the model weights on every call, and
    # pipeline() invokes this function twice per query (document + query).
    cache = getattr(generate_embeddings, "_model_cache", None)
    if cache is None:
        cache = generate_embeddings._model_cache = {}
    if model_name not in cache:
        cache[model_name] = SentenceTransformer(model_name)
    return cache[model_name].encode(chunks)
def store_in_database(chunks, embeddings):
    """Persist chunk texts and embeddings to ``embeddings.csv``.

    Each row holds the chunk text and its embedding serialized as a
    comma-separated string of floats. Overwrites any existing file.
    """
    with open("embeddings.csv", "w", newline="", encoding="utf-8") as out:
        writer = csv.writer(out)
        writer.writerow(["text", "embedding"])
        # Serialize each embedding as "v1,v2,..." alongside its chunk text.
        rows = (
            [text, ",".join(map(str, np.array(vector)))]
            for text, vector in zip(chunks, embeddings)
        )
        writer.writerows(rows)
    return
def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product divided by the product of the norms;
    1.0 for parallel vectors, 0.0 for orthogonal ones.
    """
    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return np.dot(vector1, vector2) / denominator
def load_from_database(filepath):
    """Load chunk texts and embeddings from a CSV written by store_in_database.

    Parameters
    ----------
    filepath : str
        Path to a CSV with a header row, then one row per chunk:
        ``[text, "v1,v2,..."]``.

    Returns
    -------
    tuple[list[str], numpy.ndarray]
        The chunk texts and a 2-D array of their embeddings.
    """
    chunks = []
    embeddings = []
    # Read with utf-8 explicitly: store_in_database writes utf-8, and the
    # platform default (e.g. cp1252 on Windows) would corrupt or reject
    # non-ASCII chunk text.
    with open(filepath, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader)  # Skip the "text,embedding" header row
        for row in reader:
            chunks.append(row[0])
            embeddings.append(np.array([float(v) for v in row[1].split(",")]))
    return chunks, np.array(embeddings)
def semantic_search(queryEmbedding, topK=5):
    """Return the topK stored chunks most similar to the query embedding.

    Loads the chunk/embedding store from ``embeddings.csv``, scores every
    stored embedding against the query with cosine similarity, and returns
    the chunk texts in descending similarity order.
    """
    storedChunks, storedEmbeddings = load_from_database("embeddings.csv")
    scores = [cosine_similarity(vec, queryEmbedding) for vec in storedEmbeddings]
    # Full descending ranking, truncated to the first topK positions.
    ranking = np.argsort(scores)[::-1][:topK]
    return [storedChunks[idx] for idx in ranking]
def insert_in_LMM_prompt(retrievedContext, query, model_name="gemini-1.5-flash-001"):
    """Ask the Gemini model to answer *query* grounded in *retrievedContext*.

    Builds a prompt that instructs the model to rely only on the supplied
    context, sends it to the configured Gemini model, and returns the
    generated answer text.
    """
    llm = genai.GenerativeModel(model_name)
    grounded_prompt = f"""
You are a helpful and responsible AI assistant providing professional guidance for healthcare staff.
The user has provided a knowledge base with relevant medical training materials.
Use only the retrieved context below to answer the question factually and safely.
Context:
{retrievedContext}
Question:
{query}
Answer:
"""
    reply = llm.generate_content(grounded_prompt)
    return reply.text
def pipeline(filePath, query):
    """Run the end-to-end RAG flow: index the PDF, retrieve, then answer.

    Extracts and chunks the PDF text, embeds and stores the chunks,
    embeds the query, retrieves the most similar chunks, and asks the
    LLM for an answer grounded in that context.
    """
    document_text = extract_text_data(filePath)
    document_chunks = chunk_text(document_text)
    store_in_database(document_chunks, generate_embeddings(document_chunks))
    query_vector = generate_embeddings([query])[0]
    retrieved_context = semantic_search(query_vector)
    return insert_in_LMM_prompt(retrieved_context, query)
def gradio_interface(file, question):
    """Gradio callback: run the RAG pipeline on the uploaded file's path."""
    uploaded_path = file.name
    return pipeline(uploaded_path, question)
# Create the Gradio interface
# Wires gradio_interface() to a two-input form (PDF upload + free-text
# question) and renders the model's answer as plain text.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Ask a Question")
    ],
    outputs="text",
    live=False, # Disable live updates
    title="RAG System Web App", # Title of the app
    description="Upload a PDF and ask a question to extract information from it.", # Optional description
    allow_flagging="never", # NOTE(review): deprecated in Gradio 4.x in favor of flagging_mode — confirm installed version
)
# Launch the interface
iface.launch()