import csv
import os
import re

import google.generativeai as genai
import gradio as gr
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer

# Configure the Gemini API key.
# SECURITY: never commit API keys to source control — the previous hard-coded
# key must be revoked and rotated. Read the key from the environment instead.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

def extract_text_data(path):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        path: Filesystem path to the PDF file.

    Returns:
        A single string with all page text concatenated (no separator),
        matching the previous behavior.
    """
    reader = PdfReader(path)
    # extract_text() may return None for pages with no extractable text
    # (e.g. scanned images) — coalesce to "" to avoid a TypeError.
    # "".join is also linear, unlike repeated string +=.
    return "".join(page.extract_text() or "" for page in reader.pages)

def clean_text(text):
    """Normalize common PDF-extraction artifacts in *text*.

    Replacements are applied in order; the double paragraph separator must
    be handled before the single one.
    """
    replacements = (
        ('\u2029\u2029', '\n'),  # double paragraph separator -> newline
        ('\u2029', ' '),         # single paragraph separator -> space
        ('\u2010', '-'),         # Unicode hyphen -> ASCII hyphen
        (r"\'", "'"),            # literal backslash-apostrophe -> apostrophe
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text


def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping word-based chunks.

    Args:
        text: Raw text; normalized via clean_text() before splitting.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        List of chunk strings (empty list for empty/whitespace-only text).

    Raises:
        ValueError: If chunk_size is not positive, or overlap >= chunk_size
            (the previous while-loop never terminated in that case).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = clean_text(text).split()  # word-level split avoids mid-word breaks
    step = chunk_size - overlap  # forward stride between chunk starts
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]

# Cache of loaded SentenceTransformer models, keyed by model name, so repeated
# calls (e.g. one per query) do not re-download/re-initialize the model.
_MODEL_CACHE = {}


def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
    """Encode a list of text chunks into embedding vectors.

    Args:
        chunks: List of strings to embed.
        model_name: SentenceTransformer model identifier.

    Returns:
        The array of embeddings produced by model.encode(chunks).
    """
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _MODEL_CACHE[model_name] = model
    return model.encode(chunks)

def store_in_database(chunks, embeddings, filepath="embeddings.csv"):
    """Persist chunk/embedding pairs to a CSV file.

    Args:
        chunks: List of chunk strings.
        embeddings: Parallel sequence of embedding vectors.
        filepath: Output CSV path (new parameter; defaults to the previously
            hard-coded "embeddings.csv" for backward compatibility).

    The CSV has a header row ["text", "embedding"]; each embedding is stored
    as a comma-joined string of floats, as expected by load_from_database.
    """
    with open(filepath, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["text", "embedding"])
        for chunk, embedding in zip(chunks, embeddings):
            vector = np.asarray(embedding)
            writer.writerow([chunk, ",".join(map(str, vector))])

def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors.

    Args:
        vector1: First 1-D numeric vector.
        vector2: Second 1-D numeric vector.

    Returns:
        Cosine similarity in [-1, 1]; 0.0 if either vector has zero norm
        (the previous version divided by zero and produced NaN).
    """
    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    if norm1 == 0 or norm2 == 0:
        # A zero vector has no direction; define its similarity as 0.
        return 0.0
    return np.dot(vector1, vector2) / (norm1 * norm2)

def load_from_database(filepath):
    """Load chunks and embeddings from a CSV written by store_in_database.

    Args:
        filepath: Path to the CSV database file.

    Returns:
        Tuple (chunks, embeddings) where chunks is a list of strings and
        embeddings is a 2-D numpy array (one row per chunk).
    """
    chunks = []
    embeddings = []
    # encoding="utf-8" matches the writer so non-ASCII chunks round-trip
    # correctly regardless of the platform's default encoding.
    with open(filepath, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for row in reader:
            chunks.append(row[0])
            embeddings.append(np.array([float(v) for v in row[1].split(",")]))
    return chunks, np.array(embeddings)

def semantic_search(queryEmbedding, topK=5, db_path="embeddings.csv"):
    """Return the topK stored chunks most similar to the query embedding.

    Args:
        queryEmbedding: 1-D embedding vector for the query.
        topK: Number of chunks to return, most similar first.
        db_path: CSV database to search (new parameter; defaults to the
            previously hard-coded "embeddings.csv").

    Returns:
        List of chunk strings ordered by descending cosine similarity.
    """
    dbChunks, dbEmbeddings = load_from_database(db_path)
    similarities = [cosine_similarity(vec, queryEmbedding) for vec in dbEmbeddings]
    # argsort is ascending: take the last topK indices, then reverse so the
    # best match comes first.
    topIndex = np.argsort(similarities)[-topK:][::-1]
    return [dbChunks[i] for i in topIndex]

def insert_in_LMM_prompt(retrievedContext, query, model_name="gemini-1.5-flash-001"):
    """Ask the Gemini model to answer *query* using only *retrievedContext*.

    Args:
        retrievedContext: Retrieved chunks to ground the answer in.
        query: The user's question.
        model_name: Gemini model identifier to use.

    Returns:
        The model's answer text.
    """
    prompt = f"""
        You are a helpful and responsible AI assistant providing professional guidance for healthcare staff.

        The user has provided a knowledge base with relevant medical training materials.

        Use only the retrieved context below to answer the question factually and safely.


        Context:
        {retrievedContext}

        Question:
        {query}

        Answer:
        """
    llm = genai.GenerativeModel(model_name)
    return llm.generate_content(prompt).text

def pipeline(filePath, query):
    """Run the full RAG flow: extract, chunk, embed, store, search, answer.

    Args:
        filePath: Path to the uploaded PDF.
        query: The user's question.

    Returns:
        The LLM's answer string.
    """
    raw_text = extract_text_data(filePath)
    doc_chunks = chunk_text(raw_text)
    store_in_database(doc_chunks, generate_embeddings(doc_chunks))
    query_vector = generate_embeddings([query])[0]
    context = semantic_search(query_vector)
    return insert_in_LMM_prompt(context, query)

def gradio_interface(file, question):
    """Gradio callback: answer *question* about the uploaded PDF *file*."""
    answer = pipeline(file.name, question)
    return answer

# Create the Gradio interface (kept at module level so importers can reuse it).
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Ask a Question"),
    ],
    outputs="text",
    live=False,  # Disable live updates
    title="RAG System Web App",  # Title of the app
    description="Upload a PDF and ask a question to extract information from it.",  # Optional description
    allow_flagging="never",
)

# Launch only when executed as a script; importing this module should not
# start a web server as a side effect.
if __name__ == "__main__":
    iface.launch()