import csv
import os

import google.generativeai as genai
import gradio as gr
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer

# Read the API key from the environment instead of hard-coding a secret in the source.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])


def extract_text_data(path):
    """Extract the raw text from every page of a PDF file."""
    reader = PdfReader(path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""  # extract_text() can return None for empty pages
    return text


def clean_text(text):
    """Normalize paragraph separators, hyphens, and escaped quotes."""
    text = text.replace("\u2029\u2029", "\n")
    text = text.replace("\u2029", " ")
    text = text.replace("\u2010", "-")
    text = text.replace(r"\'", "'")
    return text


def chunk_text(text, chunk_size=500, overlap=100):
    """Split the text into overlapping chunks of roughly chunk_size words."""
    clean = clean_text(text)  # Ensure text is preprocessed
    words = clean.split()  # Split by words to avoid breaking mid-word
    chunks = []
    start = 0  # Start index for chunking
    while start < len(words):
        end = start + chunk_size  # Define chunk endpoint
        chunk = " ".join(words[start:end])  # Get words within the chunk
        chunks.append(chunk.strip())  # Strip extra spaces
        start += chunk_size - overlap  # Move start forward with overlap
    return chunks


def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
    """Encode a list of text chunks into dense vectors with Sentence-Transformers."""
    model = SentenceTransformer(model_name)
    embeddings = model.encode(chunks)
    return embeddings


def store_in_database(chunks, embeddings):
    """Persist chunks and their embeddings to a simple CSV 'database'."""
    with open("embeddings.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["text", "embedding"])
        for chunk, embedding in zip(chunks, embeddings):
            writer.writerow([chunk, ",".join(map(str, np.asarray(embedding)))])


def cosine_similarity(vector1, vector2):
    """Cosine similarity between two vectors."""
    dot_product = np.dot(vector1, vector2)
    normVector1 = np.linalg.norm(vector1)
    normVector2 = np.linalg.norm(vector2)
    return dot_product / (normVector1 * normVector2)


def load_from_database(filepath):
    """Load chunks and embeddings back from the CSV store."""
    chunks = []
    embeddings = []
    with open(filepath, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader)  # Skip header
        for row in reader:
            chunk = row[0]
            embedding = np.array(list(map(float, row[1].split(","))))
            chunks.append(chunk)
            embeddings.append(embedding)
    return chunks, np.array(embeddings)


def semantic_search(queryEmbedding, topK=5):
    """Return the topK stored chunks most similar to the query embedding."""
    dbChunks, dbEmbeddings = load_from_database("embeddings.csv")
    similarities = [cosine_similarity(dbEmbedding, queryEmbedding) for dbEmbedding in dbEmbeddings]
    topIndex = np.argsort(similarities)[-topK:][::-1]  # Indices of the most similar chunks, best first
    topChunks = [dbChunks[i] for i in topIndex]
    return topChunks


def insert_in_LLM_prompt(retrievedContext, query, model_name="gemini-1.5-flash-001"):
    """Build the RAG prompt from the retrieved chunks and query the Gemini model."""
    context = "\n\n".join(retrievedContext)  # Join the retrieved chunks into one context block
    prompt = f"""You are a helpful and responsible AI assistant providing professional guidance for healthcare staff.
The user has provided a knowledge base with relevant medical training materials.
Use only the retrieved context below to answer the question factually and safely.

Context: {context}

Question: {query}

Answer:"""
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)
    return response.text


def pipeline(filePath, query):
    """End-to-end RAG pipeline: extract, chunk, embed, store, retrieve, and answer."""
    text = extract_text_data(filePath)
    chunks = chunk_text(text)
    fileEmbeddings = generate_embeddings(chunks)
    store_in_database(chunks, fileEmbeddings)
    queryEmbedding = generate_embeddings([query])[0]
    relevantData = semantic_search(queryEmbedding)
    answer = insert_in_LLM_prompt(relevantData, query)
    return answer


def gradio_interface(file, question):
    return pipeline(file.name, question)


# Create the Gradio interface
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Ask a Question"),
    ],
    outputs="text",
    live=False,  # Disable live updates
    title="RAG System Web App",  # Title of the app
    description="Upload a PDF and ask a question to extract information from it.",  # Optional description
    allow_flagging="never",
)

# Launch the interface
if __name__ == "__main__":
    iface.launch()
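
# Optional: a minimal sketch of exercising the pipeline directly, without the Gradio UI.
# Both the PDF path and the question below are hypothetical placeholders, not files or
# data that ship with this script:
#
#   answer = pipeline("training_manual.pdf", "What is the recommended hand-hygiene protocol?")
#   print(answer)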