import csv
import os
import re

import google.generativeai as genai
import gradio as gr
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer

# Configure the Gemini API key.
# SECURITY: never commit API keys to source control — the previous hard-coded
# key must be revoked and rotated. Read the key from the environment instead.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

def extract_text_data(path):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        path: Filesystem path to the PDF file.

    Returns:
        A single string with all page text concatenated (no separator),
        matching the previous behavior.
    """
    reader = PdfReader(path)
    # extract_text() may return None for pages with no extractable text
    # (e.g. scanned images) — coalesce to "" to avoid a TypeError.
    # "".join is also linear, unlike repeated string +=.
    return "".join(page.extract_text() or "" for page in reader.pages)

def clean_text(text):
    """Normalize common PDF-extraction artifacts in *text*.

    Replacements are applied in order; the double paragraph separator must
    be handled before the single one.
    """
    replacements = (
        ('\u2029\u2029', '\n'),  # double paragraph separator -> newline
        ('\u2029', ' '),         # single paragraph separator -> space
        ('\u2010', '-'),         # Unicode hyphen -> ASCII hyphen
        (r"\'", "'"),            # literal backslash-apostrophe -> apostrophe
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text


def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping word-based chunks.

    Args:
        text: Raw text; normalized via clean_text() before splitting.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        List of chunk strings (empty list for empty/whitespace-only text).

    Raises:
        ValueError: If chunk_size is not positive, or overlap >= chunk_size
            (the previous while-loop never terminated in that case).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = clean_text(text).split()  # word-level split avoids mid-word breaks
    step = chunk_size - overlap  # forward stride between chunk starts
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]

# Cache of loaded SentenceTransformer models, keyed by model name, so repeated
# calls (e.g. one per query) do not re-download/re-initialize the model.
_MODEL_CACHE = {}


def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
    """Encode a list of text chunks into embedding vectors.

    Args:
        chunks: List of strings to embed.
        model_name: SentenceTransformer model identifier.

    Returns:
        The array of embeddings produced by model.encode(chunks).
    """
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _MODEL_CACHE[model_name] = model
    return model.encode(chunks)

def store_in_database(chunks, embeddings, filepath="embeddings.csv"):
    """Persist chunk/embedding pairs to a CSV file.

    Args:
        chunks: List of chunk strings.
        embeddings: Parallel sequence of embedding vectors.
        filepath: Output CSV path (new parameter; defaults to the previously
            hard-coded "embeddings.csv" for backward compatibility).

    The CSV has a header row ["text", "embedding"]; each embedding is stored
    as a comma-joined string of floats, as expected by load_from_database.
    """
    with open(filepath, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["text", "embedding"])
        for chunk, embedding in zip(chunks, embeddings):
            vector = np.asarray(embedding)
            writer.writerow([chunk, ",".join(map(str, vector))])

def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors.

    Args:
        vector1: First 1-D numeric vector.
        vector2: Second 1-D numeric vector.

    Returns:
        Cosine similarity in [-1, 1]; 0.0 if either vector has zero norm
        (the previous version divided by zero and produced NaN).
    """
    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    if norm1 == 0 or norm2 == 0:
        # A zero vector has no direction; define its similarity as 0.
        return 0.0
    return np.dot(vector1, vector2) / (norm1 * norm2)

def load_from_database(filepath):
    """Load chunks and embeddings from a CSV written by store_in_database.

    Args:
        filepath: Path to the CSV database file.

    Returns:
        Tuple (chunks, embeddings) where chunks is a list of strings and
        embeddings is a 2-D numpy array (one row per chunk).
    """
    chunks = []
    embeddings = []
    # encoding="utf-8" matches the writer so non-ASCII chunks round-trip
    # correctly regardless of the platform's default encoding.
    with open(filepath, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for row in reader:
            chunks.append(row[0])
            embeddings.append(np.array([float(v) for v in row[1].split(",")]))
    return chunks, np.array(embeddings)

def semantic_search(queryEmbedding, topK=5, db_path="embeddings.csv"):
    """Return the topK stored chunks most similar to the query embedding.

    Args:
        queryEmbedding: 1-D embedding vector for the query.
        topK: Number of chunks to return, most similar first.
        db_path: CSV database to search (new parameter; defaults to the
            previously hard-coded "embeddings.csv").

    Returns:
        List of chunk strings ordered by descending cosine similarity.
    """
    dbChunks, dbEmbeddings = load_from_database(db_path)
    similarities = [cosine_similarity(vec, queryEmbedding) for vec in dbEmbeddings]
    # argsort is ascending: take the last topK indices, then reverse so the
    # best match comes first.
    topIndex = np.argsort(similarities)[-topK:][::-1]
    return [dbChunks[i] for i in topIndex]

def insert_in_LMM_prompt(retrievedContext, query, model_name="gemini-1.5-flash-001"):
    """Ask the Gemini model to answer *query* using only *retrievedContext*.

    Args:
        retrievedContext: Retrieved chunks to ground the answer in.
        query: The user's question.
        model_name: Gemini model identifier to use.

    Returns:
        The model's answer text.
    """
    prompt = f"""
        You are a helpful and responsible AI assistant providing professional guidance for healthcare staff.

        The user has provided a knowledge base with relevant medical training materials.

        Use only the retrieved context below to answer the question factually and safely.


        Context:
        {retrievedContext}

        Question:
        {query}

        Answer:
        """
    llm = genai.GenerativeModel(model_name)
    return llm.generate_content(prompt).text

def pipeline(filePath, query):
    """Run the full RAG flow: extract, chunk, embed, store, search, answer.

    Args:
        filePath: Path to the uploaded PDF.
        query: The user's question.

    Returns:
        The LLM's answer string.
    """
    raw_text = extract_text_data(filePath)
    doc_chunks = chunk_text(raw_text)
    store_in_database(doc_chunks, generate_embeddings(doc_chunks))
    query_vector = generate_embeddings([query])[0]
    context = semantic_search(query_vector)
    return insert_in_LMM_prompt(context, query)

def gradio_interface(file, question):
    """Gradio callback: answer *question* about the uploaded PDF *file*."""
    answer = pipeline(file.name, question)
    return answer

# Create the Gradio interface (kept at module level so importers can reuse it).
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Ask a Question"),
    ],
    outputs="text",
    live=False,  # Disable live updates
    title="RAG System Web App",  # Title of the app
    description="Upload a PDF and ask a question to extract information from it.",  # Optional description
    allow_flagging="never",
)

# Launch only when executed as a script; importing this module should not
# start a web server as a side effect.
if __name__ == "__main__":
    iface.launch()