Spaces:
Sleeping
Sleeping
File size: 4,292 Bytes
c76be51 0730116 c76be51 0730116 c76be51 0730116 6c1417f 0730116 6c1417f 0730116 6c1417f 0730116 6c1417f 0730116 3fc053b 0730116 3fc053b 6c1417f 3fc053b 0730116 3fc053b 0730116 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
# Standard library
import csv
import functools
import os
import re

# Third-party
import google.generativeai as genai
import gradio as gr
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
# Configure your API key
genai.configure(api_key="AIzaSyBgsd2j_InSYc7Zm8qIIe7yqWPworfbCS8")
def extract_text_data(path):
reader = PdfReader(path)
text = ''
for page in reader.pages:
text += page.extract_text()
return text
def clean_text(text):
text = text.replace('\u2029\u2029', '\n')
text = text.replace('\u2029', ' ')
text = text.replace('\u2010', '-')
text = text.replace(r"\'", "'")
return text
def chunk_text(text, chunk_size=500, overlap=100):
clean = clean_text(text) # Ensure text is preprocessed
words = clean.split() # Split by words to avoid breaking mid-word
chunks = []
start = 0 # Start index for chunking
while start < len(words):
end = start + chunk_size # Define chunk endpoint
chunk = " ".join(words[start:end]) # Get words within the chunk
chunks.append(chunk.strip()) # Strip extra spaces
start += chunk_size - overlap # Move start forward with overlap
return chunks
def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
model = SentenceTransformer(model_name)
embeddings = model.encode(chunks)
return embeddings
def store_in_database(chunks, embeddings):
with open("embeddings.csv", "w", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(["text", "embedding"])
for chunk, embedding in zip(chunks, embeddings):
embedding = np.array(embedding)
writer.writerow([chunk, ",".join(map(str, embedding))])
return
def cosine_similarity(vector1, vector2):
dot_product = np.dot(vector1, vector2)
normVector1 = np.linalg.norm(vector1)
normVector2 = np.linalg.norm(vector2)
similarity = dot_product / (normVector1 * normVector2)
return similarity
def load_from_database(filepath):
chunks = []
embeddings = []
with open(filepath, "r", newline="") as f:
reader = csv.reader(f)
next(reader) # Skip header
for row in reader:
chunk = row[0]
embedding = np.array(list(map(float, row[1].split(","))))
chunks.append(chunk)
embeddings.append(embedding)
return chunks, np.array(embeddings)
def semantic_search(queryEmbedding, topK=5):
dbChunks, dbEmbeddings = load_from_database("embeddings.csv")
similarities = [cosine_similarity(dbEmbedding, queryEmbedding) for dbEmbedding in dbEmbeddings]
topIndex = np.argsort(similarities)[-topK:][::-1]
topChunks = [dbChunks[i] for i in topIndex]
return topChunks
def insert_in_LMM_prompt(retrievedContext, query, model_name="gemini-1.5-flash-001"):
prompt = f"""
You are a helpful and responsible AI assistant providing professional guidance for healthcare staff.
The user has provided a knowledge base with relevant medical training materials.
Use only the retrieved context below to answer the question factually and safely.
Context:
{retrievedContext}
Question:
{query}
Answer:
"""
model = genai.GenerativeModel(model_name)
response = model.generate_content(prompt)
return response.text
def pipeline(filePath, query):
text = extract_text_data(filePath)
chunks = chunk_text(text)
fileEmbeddings = generate_embeddings(chunks)
store_in_database(chunks, fileEmbeddings)
queryEmbeddings = generate_embeddings([query])[0]
relevantData = semantic_search(queryEmbeddings)
answer = insert_in_LMM_prompt(relevantData, query)
return answer
def gradio_interface(file, question):
return pipeline(file.name, question)
# Create the Gradio interface
iface = gr.Interface(
fn=gradio_interface,
inputs=[
gr.File(label="Upload PDF"),
gr.Textbox(label="Ask a Question")
],
outputs="text",
live=False, # Disable live updates
title="RAG System Web App", # Title of the app
description="Upload a PDF and ask a question to extract information from it.", # Optional description
allow_flagging="never",
)
# Launch the interface
iface.launch()
|