Kshitij-369 committed on
Commit
e067ea8
·
1 Parent(s): 2c00e1d
Files changed (2) hide show
  1. app.py +209 -0
  2. requirements.txt +12 -0
app.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import re
3
+ import cohere
4
+ import numpy as np
5
+ import textwrap
6
+ import os
7
+ import pandas as pd
8
+ import requests
9
+ import fitz
10
+ from tqdm.auto import tqdm
11
+ from spacy.lang.en import English
12
+ from pinecone import Pinecone, ServerlessSpec
13
+
14
# Read the API keys from the environment so no secret is hard-coded in the file.
COHERE_KEY = os.getenv('COHERE_API_KEY')
PINECONE_KEY = os.getenv('PINECONE_API_KEY')

# Shared clients and settings used by the functions in this module.
# The clients must receive the key *values* read above; passing the literal
# strings 'COHERE_API_KEY' / 'PINECONE_API_KEY' would authenticate with the
# variable name instead of the secret.
co = cohere.Client(COHERE_KEY)
pc = Pinecone(api_key=PINECONE_KEY)
index_name = 'cohere-pinecone'

# Blank English pipeline with only the sentencizer component added; it is
# used to split page text into sentences.
nlp = English()
nlp.add_pipe("sentencizer")
24
+
25
def text_formatter(text: str) -> str:
    """Flatten *text* onto one line.

    Every newline character becomes a single space, then leading and
    trailing whitespace is removed.
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()
27
+
28
def open_and_read_pdf(pdf_path: str, page_offset: int = 0) -> list[dict]:
    """Read every page of a PDF and return its text with simple statistics.

    Args:
        pdf_path: Path of the PDF file to open.
        page_offset: Value subtracted from the zero-based page index to
            produce the reported ``page_number``.

    Returns:
        One dict per page with the keys ``page_number``,
        ``page_char_count``, ``page_word_count``,
        ``page_sentence_count_raw``, ``page_token_count`` and ``text``.
    """
    pages_and_texts = []
    # The context manager closes the document even if a page fails to parse;
    # previously the handle was left open.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc):
            text = text_formatter(page.get_text())
            pages_and_texts.append({
                "page_number": page_number - page_offset,
                "page_char_count": len(text),
                "page_word_count": len(text.split(" ")),
                # Naive sentence count: number of pieces split on ". ".
                "page_sentence_count_raw": len(text.split(". ")),
                # Token estimate: character count divided by 4.
                "page_token_count": len(text) / 4,
                "text": text,
            })
    return pages_and_texts
43
+
44
def split_list(input_list: list, slice_size: int) -> list[list[str]]:
    """Cut *input_list* into consecutive slices of at most *slice_size* items.

    The final slice holds whatever is left over and may be shorter.
    """
    pieces = []
    for start in range(0, len(input_list), slice_size):
        stop = start + slice_size
        pieces.append(input_list[start:stop])
    return pieces
46
+
47
def process_pdf(pdf_path):
    """Chunk a PDF, embed the chunks with Cohere and store them in Pinecone.

    Steps:
        1. Read the page texts with ``open_and_read_pdf``.
        2. Split each page into sentences and group them ten at a time.
        3. Drop chunks whose estimated token count (characters / 4) is not
           above 30.
        4. Embed the remaining chunks and upsert them, in batches of 128,
           into the index named by ``index_name`` (created if missing).

    Args:
        pdf_path: Path of the PDF file to index.

    Returns:
        A status message for display in the UI.
    """
    pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

    text_chunks = []
    for item in pages_and_texts:
        sentences = [str(sentence) for sentence in nlp(item["text"]).sents]
        for sentence_chunk in split_list(input_list=sentences, slice_size=10):
            # Collapse runs of spaces; the previous replace(" ", " ") was a
            # no-op and left the text unchanged.
            chunk = re.sub(r' {2,}', ' ', "".join(sentence_chunk)).strip()
            # Sentences are joined without a separator, so put a space back
            # after a full stop that is directly followed by a capital.
            chunk = re.sub(r'\.([A-Z])', r'. \1', chunk)
            # Keep only chunks above the minimum estimated token length.
            if len(chunk) / 4 > 30:
                text_chunks.append(chunk)

    # Without this guard an empty or image-only PDF crashed further down
    # (nothing to embed, no first vector to read the dimension from).
    if not text_chunks:
        return "No text long enough to index was found in the PDF."

    embeds = co.embed(
        texts=text_chunks,
        model='embed-english-v2.0',
        # These texts are the stored documents, not search queries.
        input_type='search_document',
        truncate='END'
    ).embeddings

    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=len(embeds[0]),
            metric="cosine",
            spec=ServerlessSpec(cloud='aws', region='us-east-1')
        )

    index = pc.Index(index_name)

    # NOTE(review): ids are positional ("0", "1", ...), so indexing a second
    # PDF without clearing the index overwrites vectors with the same ids.
    ids = [str(i) for i in range(len(embeds))]
    meta = [{'text': text} for text in text_chunks]
    to_upsert = list(zip(ids, embeds, meta))

    batch_size = 128
    for start in range(0, len(to_upsert), batch_size):
        # Slicing already clamps at the end of the list.
        index.upsert(vectors=to_upsert[start:start + batch_size])

    return "PDF processed and indexed successfully!"
101
+
102
def search_queries(queries: list[str], k: int = 1) -> str:
    """Look up each query in the Pinecone index and format the matches.

    Args:
        queries: Query strings to embed and search for.
        k: Number of matches requested per query (``top_k``).

    Returns:
        One text section per query, in input order. Each section starts
        with ``Results for Query: <query>``, lists the ``text`` metadata of
        every match followed by a dashed rule, and ends with a rule of
        ``=``. An empty *queries* list yields an empty string.
    """
    # Nothing to search for: skip the embedding request entirely.
    if not queries:
        return ""

    query_embeddings = co.embed(
        texts=queries,
        model='embed-english-v2.0',
        input_type='search_query',
        truncate='END'
    ).embeddings

    index = pc.Index(index_name)

    # Pair queries with embeddings positionally. Keying results by query
    # text in a dict merged duplicate queries into a single section.
    sections = []
    for query, query_embedding in zip(queries, query_embeddings):
        res = index.query(vector=query_embedding, top_k=k, include_metadata=True)

        parts = [f"Results for Query: {query}\n\n"]
        for match in res['matches']:
            text = match['metadata']['text']
            parts.append(f"{text}\n{'-'*50}\n\n")
        parts.append(f"\n{'='*100}\n\n")

        sections.append("".join(parts))

    return "".join(sections)
129
+
130
def chatbot(message, history):
    """Answer a chat message that may contain several '||'-separated queries.

    Args:
        message: Raw text typed by the user.
        history: Chat history supplied by the caller; not used here.

    Returns:
        A prompt to enter a query when the message is blank or contains no
        usable query; otherwise the search results of every query, joined
        by a ``---`` separator.
    """
    if message.strip() == "":
        return "Please enter a valid query."

    # Break the message on '||' and keep only the non-blank pieces.
    queries = []
    for piece in message.split('||'):
        cleaned = piece.strip()
        if cleaned:
            queries.append(cleaned)

    if len(queries) == 0:
        return "Please enter at least one valid query."

    # Each query is searched on its own and rendered under its own heading.
    return "\n\n---\n\n".join(
        f"Query: {query}\n\n{search_queries([query])}" for query in queries
    )
146
+
147
def clear_index():
    """Delete the Pinecone index named by ``index_name``.

    Returns:
        A success message, or an error message containing the exception
        text when the deletion fails.
    """
    try:
        pc.delete_index(index_name)
    except Exception as error:
        # Report the failure as a status string instead of raising.
        return "Error clearing Pinecone index: " + str(error)
    return "Pinecone index cleared successfully!"
153
+
154
def upload_pdf(file):
    """Handle the "Process PDF" button: index the uploaded file.

    Args:
        file: Value delivered by the file-upload component, or ``None``
            when nothing was uploaded.

    Returns:
        A status message: a prompt to upload a file, the result of
        ``process_pdf``, or an error message if processing raised.
    """
    if file is None:
        return "Please upload a PDF file."

    # Accept either an object exposing the path as ``.name`` or a plain
    # path string, so the handler does not depend on which one is passed.
    file_path = getattr(file, "name", file)

    # Same convention as clear_index: failures are shown in the status box
    # rather than propagating out of the UI callback.
    try:
        return process_pdf(file_path)
    except Exception as e:
        return f"Error processing PDF: {str(e)}"
161
+
162
+
163
# Gradio UI: an upload/processing row, a status box and a chat panel.
demo = gr.Blocks()

with demo:

    gr.Markdown("# PDF Chatbot with Multi-Query Support")

    gr.Markdown("""
    ## How to use:
    1. Upload a PDF and click "Process PDF".
    2. Enter your queries in the chat below.
    3. For multiple queries, separate them with '||'.

    Example: What are macronutrients? || What is the role of vitamins?
    """)

    with gr.Row():
        with gr.Column(scale=2):
            pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
        with gr.Column(scale=1):
            process_button = gr.Button("Process PDF")
            clear_button = gr.Button("Clear Index")

    status_output = gr.Textbox(label="Status")

    # NOTE(review): requirements.txt does not pin gradio; confirm that the
    # installed version still accepts clear_btn / retry_btn / undo_btn.
    chatbot_interface = gr.ChatInterface(
        fn=chatbot,
        chatbot=gr.Chatbot(height=500),
        textbox=gr.Textbox(placeholder="Enter your query here...", container=False, scale=7),
        submit_btn="Send",
        clear_btn="🗑️ Clear",
        retry_btn="🔄 Retry",
        undo_btn="↩️ Undo",
        theme="soft",
        examples=[
            "What are macronutrients?",
            "What is the role of vitamins? || How do minerals affect health?",
            "Define protein || Define carbohydrates || Define fats"
        ],
    )

    # A second "Clear Index" button used to be created here. It rebound
    # clear_button, so the button in the row above never got a click
    # handler. Only the single button from the row is wired up now.
    process_button.click(upload_pdf, inputs=[pdf_upload], outputs=[status_output])
    clear_button.click(clear_index, inputs=None, outputs=[status_output])

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ gradio
3
+ cohere
4
+ numpy
5
+ pandas
6
+ requests
7
+ PyMuPDF
8
+ tqdm
9
+ spacy
10
+ pinecone-client
11
+ accelerate
12
+ bitsandbytes