iaravagni committed on
Commit
0730116
·
1 Parent(s): c76be51

app update

Browse files
Files changed (1) hide show
  1. app.py +109 -4
app.py CHANGED
@@ -1,7 +1,112 @@
1
  import gradio as gr
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import csv
import os
import re

import google.generativeai as genai
import gradio as gr
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
8
 
9
# Configure the Gemini client with a key taken from the environment.
# SECURITY: never hard-code the key in source. The key that was previously
# written on this line was committed to version control, so it must be
# treated as compromised and revoked.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
11
 
12
def extract_text_data(path):
    """Return the text of every page of the PDF at ``path``, concatenated.

    Args:
        path: Location of the PDF file, passed straight to ``PdfReader``.

    Returns:
        A single string holding the extracted text of all pages in page
        order, with no separator inserted between pages.
    """
    reader = PdfReader(path)
    # Join once instead of growing a string with ``+=`` on every page.
    return ''.join(page.extract_text() for page in reader.pages)
18
+
19
def clean_text(text):
    """Normalise separator and punctuation artefacts in extracted text.

    The substitutions run in a fixed order: a doubled paragraph separator
    (U+2029 U+2029) becomes a newline, any remaining single U+2029 becomes
    a space, the Unicode hyphen U+2010 becomes an ASCII hyphen, and a
    backslash-escaped apostrophe becomes a plain apostrophe.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Order matters: the doubled separator must be handled before the single one.
    substitutions = (
        ('\u2029\u2029', '\n'),
        ('\u2029', ' '),
        ('\u2010', '-'),
        (r"\'", "'"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text
25
+
26
def chunk_text(text):
    """Split text into non-empty, whitespace-trimmed lines.

    The text is first passed through ``clean_text`` and then split on
    newline characters; lines that are empty after stripping are dropped.

    Args:
        text: Raw text to split.

    Returns:
        A list of stripped, non-empty strings in their original order.
    """
    pieces = []
    for line in re.split(r'\n', clean_text(text)):
        trimmed = line.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces
31
+
32
# Loaded SentenceTransformer instances, keyed by model name.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it at most once."""
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    return model


def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
    """Encode text chunks into embedding vectors.

    The model is cached per ``model_name`` so that repeated calls (the
    pipeline embeds the document and then the query on every request) do
    not construct the model again each time.

    Args:
        chunks: List of strings to embed.
        model_name: Name of the SentenceTransformer model to use.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``chunks``;
        callers index it per chunk.
    """
    return _get_embedding_model(model_name).encode(chunks)
36
+
37
def store_in_database(chunks, embeddings):
    """Write chunks and their embeddings to ``embeddings.csv``.

    The file is overwritten on every call. It has a ``text,embedding``
    header followed by one row per chunk, where the embedding is stored as
    a comma-joined string of its components.

    Args:
        chunks: Text chunks, parallel to ``embeddings``.
        embeddings: One vector per chunk.
    """
    # Explicit UTF-8 so non-ASCII PDF text does not depend on the locale's
    # default encoding; load_from_database reads with the same encoding.
    with open("embeddings.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["text", "embedding"])
        writer.writerows(
            [chunk, ",".join(map(str, np.array(embedding)))]
            for chunk, embedding in zip(chunks, embeddings)
        )


def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors.

    Args:
        vector1: First vector.
        vector2: Second vector, same length as ``vector1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm (the similarity is
        undefined there and the division would otherwise produce NaN).
    """
    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if denominator == 0:
        return 0.0
    return np.dot(vector1, vector2) / denominator


def load_from_database(filepath):
    """Read chunks and embeddings back from a CSV written by ``store_in_database``.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A ``(chunks, embeddings)`` tuple: a list of chunk strings and a
        NumPy array built from the per-row embedding vectors. Both are
        empty when the file has no data rows.
    """
    chunks = []
    embeddings = []
    with open(filepath, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header; tolerate a completely empty file
        for row in reader:
            # Ignore blank or truncated rows instead of failing on float('').
            if len(row) < 2 or not row[1]:
                continue
            chunks.append(row[0])
            embeddings.append(np.array([float(value) for value in row[1].split(",")]))
    return chunks, np.array(embeddings)
65
+
66
def semantic_search(queryEmbedding, topK=3):
    """Return the stored chunks most similar to a query embedding.

    Chunks and embeddings are loaded from ``embeddings.csv`` and ranked by
    cosine similarity to ``queryEmbedding``.

    Args:
        queryEmbedding: Embedding vector of the query.
        topK: Maximum number of chunks to return.

    Returns:
        Up to ``topK`` chunk strings, most similar first. An empty list
        when ``topK`` is not positive.
    """
    # Guard: with topK == 0 the slice [-0:] would select every chunk.
    if topK <= 0:
        return []
    dbChunks, dbEmbeddings = load_from_database("embeddings.csv")
    scores = np.asarray(
        [cosine_similarity(dbEmbedding, queryEmbedding) for dbEmbedding in dbEmbeddings],
        dtype=float,
    )
    # argsort places NaN last, which would rank undefined scores first once
    # the order is reversed; push them to the bottom instead.
    scores[np.isnan(scores)] = -np.inf
    ranked = np.argsort(scores)[::-1][:topK]
    return [dbChunks[i] for i in ranked]
72
+
73
def insert_in_LMM_prompt(retrievedContext, query, model_name="gemini-1.5-flash-001"):
    """Ask the generative model to answer ``query`` using retrieved context.

    Args:
        retrievedContext: Either a single string or an iterable of text
            chunks. Chunks are joined with blank lines so the prompt holds
            plain text rather than a Python list representation.
        query: The user's question.
        model_name: Name of the generative model to use.

    Returns:
        The ``text`` attribute of the model's response.
    """
    if isinstance(retrievedContext, str):
        context = retrievedContext
    else:
        context = "\n\n".join(str(chunk) for chunk in retrievedContext)
    prompt = f"""
You are an AI assistant answering a user's query based on retrieved knowledge.

Context:
{context}

Question:
{query}

Answer:
"""
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)
    return response.text
88
+
89
def pipeline(filePath, query):
    """Answer ``query`` from the contents of the PDF at ``filePath``.

    Steps, in order: extract the PDF text, split it into chunks, embed the
    chunks and store them, embed the query, retrieve the most similar
    chunks, and pass them with the query to the generative model.

    Args:
        filePath: Path of the PDF to read.
        query: The question to answer.

    Returns:
        The answer text produced by ``insert_in_LMM_prompt``.
    """
    document_chunks = chunk_text(extract_text_data(filePath))
    store_in_database(document_chunks, generate_embeddings(document_chunks))
    # generate_embeddings takes a list, so wrap the query and unwrap the result.
    query_vector = generate_embeddings([query])[0]
    context = semantic_search(query_vector)
    return insert_in_LMM_prompt(context, query)
98
+
99
def gradio_interface(file, question):
    """Gradio callback: answer ``question`` about the uploaded PDF.

    Args:
        file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string; ``None`` when nothing has
            been uploaded.
        question: The question text; may be empty or ``None``.

    Returns:
        The pipeline's answer, or a short instruction message when the
        file or the question is missing.
    """
    # Without this guard a missing upload fails on ``None.name``, and a
    # blank question would still trigger embedding and a model call.
    if file is None or not question or not question.strip():
        return "Please upload a PDF and enter a question."
    path = file if isinstance(file, str) else file.name
    return pipeline(path, question)
101
+
102
# Web UI: a PDF upload and a question box, answered by gradio_interface.
# ``live=True`` is intentionally not set: in live mode the callback reruns on
# every input change, and each run embeds the whole PDF again and makes a
# generative-model API call. Answers are produced on explicit submit.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Ask a Question"),
    ],
    outputs="text",
)

iface.launch()