JaMussCraft commited on
Commit
8800ef0
·
verified ·
1 Parent(s): 9669167

Upload folder using huggingface_hub

Browse files
.github/workflows/update_space.yml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Run Python script
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - name: Checkout
14
+ uses: actions/checkout@v2
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v2
18
+ with:
19
+ python-version: '3.9'
20
+
21
+ - name: Install Gradio
22
+ run: python -m pip install gradio
23
+
24
+ - name: Log in to Hugging Face
25
+ run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
26
+
27
+ - name: Deploy to Spaces
28
+ run: gradio deploy
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ venv/
2
+ scratch_space/
3
+ persisted_indexes/
4
+ uploaded_files/
5
+ rag_model_uky_rag_model_uky_libs_api.py
6
+ uploaded_files_state.json
7
+ ktem_app_data/
8
+ .gradio/
RAG UI.py ADDED
@@ -0,0 +1,562 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import csv
3
+ import random
4
+ import os
5
+ import shutil
6
+ import json
7
+ from llama_index.embeddings.openai import OpenAIEmbedding
8
+ from llama_index.core import (
9
+ VectorStoreIndex,
10
+ SimpleDirectoryReader,
11
+ StorageContext,
12
+ load_index_from_storage,
13
+ )
14
+ from llama_index.core.settings import Settings
15
+ import faiss
16
+ import numpy as np
17
+ from llama_index.vector_stores.faiss import FaissVectorStore
18
+ from llama_index.core.node_parser import SimpleNodeParser, SentenceSplitter
19
+ from llama_index.core.schema import Document
20
+ from llama_index.core.schema import IndexNode
21
+ from llama_index.core import ServiceContext
22
+ from llama_index.core.query_engine.retriever_query_engine import RetrieverQueryEngine
23
+ from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding
24
+ from llama_index.llms.openai import OpenAI
25
+ from transformers import BitsAndBytesConfig
26
+ from llama_index.core.prompts import PromptTemplate
27
+ import torch
28
+ import pandas as pd
29
+ import fitz
30
+ from transformers import pipeline
31
+ from sklearn.metrics.pairwise import cosine_similarity
32
+ from sklearn.feature_extraction.text import TfidfVectorizer
33
+
34
+
35
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
36
+
37
+ # os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
38
+ llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=512)
39
+ Settings.llm = llm
40
+
41
+ UPLOAD_DIR = "uploaded_files"
42
+ STATE_FILE = "uploaded_files_state.json"
43
+ PERSIST_DIR = "persisted_indexes"
44
+
45
+ os.makedirs(UPLOAD_DIR, exist_ok=True)
46
+ os.makedirs(PERSIST_DIR, exist_ok=True)
47
+
48
+ # !!! why???
49
+ # torch.set_num_threads(1)
50
+ # torch.set_num_interop_threads(1)
51
+
52
+
53
def index_gen(file_path, index_name):
    """Build a FAISS-backed vector index for one uploaded file and persist it.

    Loads the document at ``file_path``, embeds it with the
    BAAI/bge-small-en-v1.5 HuggingFace model, stores the vectors in a flat-L2
    FAISS index, and persists both the llama-index storage context and the raw
    FAISS index under ``PERSIST_DIR/<index_name>/``.

    Args:
        file_path: Path of the uploaded file to index.
        index_name: Base name (file name without extension) used for the
            per-file index directory and the ``.faiss`` file.

    Returns:
        Path of the written ``<index_name>.faiss`` file, or ``None`` when
        indexing fails (the error is printed rather than raised so the UI
        keeps running; callers must check for ``None``).
    """
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    try:
        # Load document from file
        documents = SimpleDirectoryReader(input_files=[file_path]).load_data()

        # Initialize embedding model and vector store
        embed_model = HuggingFaceEmbedding(
            model_name="BAAI/bge-small-en-v1.5", device=device
        )
        Settings.embed_model = embed_model
        embedding_dim = 384  # Must match the embedding model's output dimension

        faiss_index = faiss.IndexFlatL2(embedding_dim)
        vector_store = FaissVectorStore(faiss_index=faiss_index)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        print(f"Number of documents to index: {len(documents)}.")

        # Parse and index documents
        parser = SentenceSplitter()
        nodes = parser.get_nodes_from_documents(documents)
        index = VectorStoreIndex(nodes, storage_context=storage_context)
        print(f"Number of nodes generated:{len(nodes)}")

        # Each file gets its own directory so indexes can be created and
        # deleted independently of one another.
        index_directory = os.path.join(PERSIST_DIR, index_name)
        os.makedirs(index_directory, exist_ok=True)
        index_path = os.path.join(index_directory, f"{index_name}.faiss")

        index.storage_context.persist(index_directory)
        faiss.write_index(faiss_index, index_path)

        if not os.path.exists(index_path):
            raise FileNotFoundError(
                f"FAISS index file not created at path: {index_path}"
            )

        return index_path

    except Exception as e:
        print(f"Error in index_gen with file {file_path}: {str(e)}")
        return None
127
+
128
+
129
def save_uploaded_files_state(uploaded_files, indexed_files=None):
    """Persist the sets of uploaded and indexed file paths to STATE_FILE.

    Args:
        uploaded_files: Iterable of server-side paths of uploaded files.
        indexed_files: Optional iterable of FAISS index paths. When omitted,
            the indexed-file set already stored on disk is preserved — without
            this, a plain upload (which only knows about uploaded files)
            would silently wipe the indexing state.
    """
    try:
        state_file_json = {"uploaded_files": list(uploaded_files)}

        if indexed_files:
            state_file_json["indexed_files"] = list(indexed_files)
        else:
            # BUG FIX: previously the "indexed_files" key was simply omitted
            # here, so the next load_uploaded_files_state() returned an empty
            # indexed set after any upload. Carry the on-disk value forward.
            _, existing_indexed_files = load_uploaded_files_state()
            state_file_json["indexed_files"] = list(existing_indexed_files)

        with open(STATE_FILE, "w") as f:
            json.dump(state_file_json, f, indent=4)

    except IOError as e:
        print(f"Error saving uploaded files state: {str(e)}")
147
+
148
+
149
def load_uploaded_files_state():
    """Read the persisted upload/index state from STATE_FILE.

    Returns:
        Tuple ``(uploaded_files, indexed_files)`` as sets of paths. Both sets
        are empty when the state file is missing, unreadable, or malformed.
    """
    try:
        if os.path.exists(STATE_FILE):
            with open(STATE_FILE, "r") as f:
                state_data = json.load(f)
            uploaded = set(state_data.get("uploaded_files", []))
            indexed = set(state_data.get("indexed_files", []))
            return uploaded, indexed

    except (IOError, json.JSONDecodeError) as e:
        print(f"Error loading uploaded files state: {str(e)}")

    # Fallback for a fresh install or a corrupt state file.
    return set(), set()
162
+
163
+
164
def save_file(file_path):
    """Copy an uploaded temp file into UPLOAD_DIR.

    Args:
        file_path: Source path of the file Gradio wrote to its temp area.

    Returns:
        The destination path inside UPLOAD_DIR, or ``None`` if the copy failed.
    """
    destination = os.path.join(UPLOAD_DIR, os.path.basename(file_path))
    try:
        shutil.copy(file_path, destination)
    except (IOError, shutil.Error) as e:
        print(f"Error saving file {file_path}: {str(e)}")
        return None
    return destination
174
+
175
+
176
# --- Gradio UI layout ---
with gr.Blocks() as demo:
    gr.Markdown("## 📁 File Management & Chat Assistant")

    with gr.Tabs():
        # Tab 1: File Management — file upload/selection controls on the
        # left, the RAG chat assistant on the right.
        with gr.Tab("File Management"):
            with gr.Row():
                with gr.Column(scale=1):
                    file_upload = gr.File(
                        label="Upload PDF,JSON or TXT Files",
                        file_types=[".pdf", ".json", ".txt", "directory"],
                        file_count="multiple",
                        interactive=True,
                    )
                    # Read-only listing of uploaded files (name + size).
                    file_table = gr.DataFrame(
                        headers=["Sr. No.", "File Name", "File Size"],
                        value=[],
                        interactive=False,
                        row_count=(4, "dynamic"),
                        wrap=True,
                        max_height=1000
                    )
                    # Choices are labels of the form "<n>. <file name>";
                    # handlers split on ". " to recover the file name.
                    file_checkbox = gr.CheckboxGroup(
                        label="Select Files to Index/Delete", choices=[]
                    )
                    select_all_button = gr.Button("Select All")
                    index_button = gr.Button("Index Selected Files")
                    delete_button = gr.Button("Delete Selected Files")

                with gr.Column(scale=3):
                    message_box = gr.Markdown("")
                    chatbot = gr.Chatbot(label="LLM", type="messages")

                    with gr.Row():
                        chat_input = gr.Textbox(
                            show_label=False,
                            placeholder="Type your message here",
                            scale=8,
                        )
                        send_button = gr.Button("Send", scale=1)

        # Tab 2: Indexed Files — read-only view of persisted FAISS indexes.
        with gr.Tab("Indexed Files"):
            indexed_file_table = gr.DataFrame(
                headers=["Indexed File", "Size"],
                value=[],
                interactive=False,
                row_count=(4, "dynamic"),
            )

    # STATES: a (uploaded_files, indexed_files) tuple of path sets, restored
    # from disk at startup and re-persisted by the handlers below.
    uploaded_files_state = gr.State(load_uploaded_files_state())
228
+
229
+ @delete_button.click(
230
+ inputs=[file_checkbox, uploaded_files_state, file_upload],
231
+ outputs=[file_table, file_checkbox, uploaded_files_state, indexed_file_table],
232
+ )
233
+ def delete_files(selected_files, uploaded_files_state, file_upload):
234
+ print("deleting files...: ", selected_files, uploaded_files_state, file_upload)
235
+
236
+ uploaded_files, indexed_files = uploaded_files_state
237
+
238
+ if not selected_files or not uploaded_files:
239
+ return gr.update(), selected_files, (uploaded_files, indexed_files)
240
+
241
+ # default return
242
+ # return [[]], selected_files, uploaded_files_state
243
+
244
+ # "we" means with extension
245
+ selected_file_names_we = [file.split(". ")[1] for file in selected_files]
246
+
247
+ for file_name_we in selected_file_names_we:
248
+ file_path = os.path.join(UPLOAD_DIR, file_name_we)
249
+ index_name = file_name_we.split(".")[0]
250
+ index_directory = os.path.join(PERSIST_DIR, index_name)
251
+ index_path = os.path.join(index_directory, f'{index_name}.faiss')
252
+ print(file_name_we, file_path, index_name, index_directory, index_path)
253
+
254
+ try:
255
+ if os.path.exists(file_path):
256
+ os.remove(file_path)
257
+ uploaded_files.remove(file_path)
258
+
259
+ else:
260
+ gr.Error(f"Could not delete file (File not found): {file_path}", duration=3)
261
+
262
+ if os.path.exists(index_directory):
263
+ shutil.rmtree(index_directory)
264
+ indexed_files.remove(index_path)
265
+
266
+ else:
267
+ gr.Error(f"Could not delete index directory (Path not found): {index_directory}", duration=3)
268
+
269
+ except Exception as e:
270
+ gr.Error(f"Error deleting {file_name_we}: {str(e)}", duration=3)
271
+
272
+ save_uploaded_files_state(uploaded_files, indexed_files)
273
+
274
+ file_info, checkbox_options = [], []
275
+ for idx, file_path in enumerate(uploaded_files, start=1):
276
+ file_name = os.path.basename(file_path)
277
+ file_size = os.path.getsize(file_path)
278
+ file_info.append([idx, file_name, f"{round(file_size / 1024, 2)} KB"])
279
+ checkbox_options.append(f"{idx}. {file_name}")
280
+
281
+ indexed_file_display = [
282
+ [
283
+ os.path.basename(index_path).split(".")[0],
284
+ f"{round(os.path.getsize(index_path) / 1024, 2)} KB",
285
+ ]
286
+ for index_path in indexed_files
287
+ ]
288
+
289
+ return (
290
+ file_info,
291
+ gr.update(choices=checkbox_options, value=[]),
292
+ (uploaded_files, indexed_files),
293
+ indexed_file_display,
294
+ )
295
+
296
+ @chat_input.submit(
297
+ inputs=[chat_input, chatbot, uploaded_files_state],
298
+ outputs=[chat_input, chatbot],
299
+ )
300
+ @send_button.click(
301
+ inputs=[chat_input, chatbot, uploaded_files_state],
302
+ outputs=[chat_input, chatbot],
303
+ )
304
+ # Chat function with improved SQuAD matching
305
+ def chat_with_bot(user_input, chat_history, uploaded_files_state):
306
+ if not user_input:
307
+ return user_input, chat_history
308
+
309
+ _, indexed_files = uploaded_files_state
310
+
311
+ chat_history.append(
312
+ {
313
+ "role": "user",
314
+ "content": user_input,
315
+ }
316
+ )
317
+
318
+ response = "I do not have the answer. Please upload and index relevant files first."
319
+ file_with_answer = None
320
+ custom_prompt = PromptTemplate(
321
+ template=(
322
+ "Use the following context to answer the query. Do not use outside knowledge. "
323
+ "If the answer is not found in the context, respond with: 'I do not have the answer.'\n"
324
+ "Context: {context_str}\n"
325
+ "Query: {query_str}\n"
326
+ "Answer:"
327
+ )
328
+ )
329
+
330
+ if not index_files:
331
+ response = "No files have been indexed for answering this question."
332
+
333
+ try:
334
+ for index_path in indexed_files:
335
+ print('checking ', index_path)
336
+ file_name = os.path.basename(index_path)
337
+ index_name = file_name.split(".")[0]
338
+
339
+ if not os.path.exists(index_path):
340
+ print(f"FAISS index not found at {index_path}, skipping...")
341
+ continue
342
+
343
+ storage_context = None
344
+ try:
345
+ faiss_index = faiss.read_index(index_path)
346
+ embed_model = HuggingFaceEmbedding(
347
+ model_name="BAAI/bge-small-en-v1.5"
348
+ )
349
+ Settings.embed_model = embed_model
350
+
351
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
352
+ storage_context = StorageContext.from_defaults(
353
+ persist_dir=f'{PERSIST_DIR}/{index_name}', vector_store=vector_store
354
+ )
355
+
356
+ except Exception as e:
357
+ raise RuntimeError(
358
+ f"Failed to load FAISS index at {index_path}: {str(e)}"
359
+ )
360
+
361
+ # print(get_global("embed_model"))
362
+
363
+ index = load_index_from_storage(storage_context)
364
+ print(f"Index loaded with {len(index.docstore.docs)} documents.")
365
+
366
+ retriever = index.as_retriever(similarity_top_k=10)
367
+ query_engine = RetrieverQueryEngine(retriever=retriever)
368
+ query_engine.update_prompts(
369
+ {"response_synthesizer:text_qa_template": custom_prompt}
370
+ )
371
+
372
+ # Query the index for the user input
373
+ query_result = query_engine.query(user_input)
374
+ print("query result: ", query_result)
375
+
376
+ if query_result.response.strip() != "I do not have the answer.":
377
+ response = f"{query_result.response} \n\n Source: {file_name}"
378
+ # response = f"Answer from indexed file '{file_name}': {query_result.response}"
379
+ file_with_answer = file_name
380
+ break
381
+
382
+ else:
383
+ response = "I do not have the answer."
384
+
385
+ except Exception as e:
386
+ response = f"Error querying the index: {str(e)}"
387
+ print(response)
388
+
389
+ chat_history.append(
390
+ {
391
+ "role": "assistant",
392
+ "content": response,
393
+ }
394
+ )
395
+
396
+ return gr.update(value=""), chat_history
397
+
398
+ @index_button.click(
399
+ inputs=[file_checkbox, uploaded_files_state, indexed_file_table],
400
+ outputs=[
401
+ file_checkbox,
402
+ uploaded_files_state,
403
+ indexed_file_table,
404
+ select_all_button,
405
+ ],
406
+ )
407
+ def index_files(selected_files, uploaded_files_state, indexed_file_table):
408
+ uploaded_files, indexed_files = uploaded_files_state
409
+ print("indexing files...", selected_files, uploaded_files_state)
410
+
411
+ if not selected_files or not uploaded_files:
412
+ gr.Warning("Please select or upload files for indexing.", duration=3)
413
+ return (
414
+ selected_files,
415
+ uploaded_files_state,
416
+ indexed_file_table,
417
+ gr.update(),
418
+ )
419
+
420
+
421
+ files_to_index = []
422
+ for file in selected_files:
423
+ file_name_we = file.split(". ")[1]
424
+ file_path = os.path.join(UPLOAD_DIR, file_name_we)
425
+ index_name = file_name_we.split(".")[0]
426
+ index_directory = os.path.join(PERSIST_DIR, index_name)
427
+ index_path = os.path.join(index_directory, f'{index_name}.faiss')
428
+
429
+ if index_path not in indexed_files:
430
+ files_to_index.append(file_path)
431
+ else:
432
+ gr.Info(
433
+ f"File '{os.path.basename(file_path)}' is already indexed.",
434
+ duration=3,
435
+ )
436
+
437
+ for file_path in files_to_index:
438
+ try:
439
+ file_name = os.path.basename(file_path)
440
+ index_name = file_name.split(".")[0]
441
+ index_path = index_gen(file_path, index_name)
442
+ gr.Info(f"Successfully indexed: {file_name}", duration=3)
443
+
444
+ # Save indexed file info for persistence
445
+ # index_path = os.path.join(PERSIST_DIR, f"{index_name}.faiss")
446
+ indexed_files.add(index_path)
447
+
448
+ except Exception as e:
449
+ gr.Error(f"Error indexing {file_path}: {str(e)}", duration=3)
450
+
451
+ # Update the state with new indexed files
452
+ save_uploaded_files_state(uploaded_files, indexed_files)
453
+
454
+ # Convert indexed file info to display format
455
+ indexed_file_display = [
456
+ [
457
+ os.path.basename(index_path).split(".")[0],
458
+ f"{round(os.path.getsize(index_path) / 1024, 2)} KB",
459
+ ]
460
+ for index_path in indexed_files
461
+ ]
462
+
463
+ return (
464
+ gr.update(value=[]),
465
+ (uploaded_files, indexed_files),
466
+ indexed_file_display,
467
+ gr.update(value="Select All"),
468
+ )
469
+
470
+ @select_all_button.click(
471
+ inputs=[uploaded_files_state, select_all_button, file_checkbox],
472
+ outputs=[file_checkbox, select_all_button],
473
+ )
474
+ def select_all_checkbox(uploaded_files_state, select_all_button, file_checkbox):
475
+ uploaded_files, _ = uploaded_files_state
476
+
477
+ if not uploaded_files:
478
+ return file_checkbox, select_all_button
479
+
480
+ button_value = ""
481
+ if select_all_button == "Select All":
482
+ button_value = "Unselect All"
483
+ else:
484
+ button_value = "Select All"
485
+
486
+ checkbox_options = []
487
+ if not file_checkbox:
488
+ checkbox_options = [
489
+ f"{idx + 1}. {os.path.basename(file)}"
490
+ for idx, file in enumerate(uploaded_files)
491
+ ]
492
+
493
+ return gr.update(value=checkbox_options), gr.update(value=button_value)
494
+
495
+ # Load initial state when app starts
496
+ @demo.load(
497
+ inputs=[uploaded_files_state],
498
+ outputs=[file_table, file_checkbox, uploaded_files_state, indexed_file_table],
499
+ )
500
+ def load_state_on_start(uploaded_files_state):
501
+ uploaded_files, indexed_files = load_uploaded_files_state()
502
+
503
+ print("demo loading...", uploaded_files, indexed_files)
504
+
505
+ # Populate uploaded files table and checkbox options
506
+ file_info = []
507
+ checkbox_options = []
508
+ for idx, server_file_path in enumerate(uploaded_files, start=1):
509
+ file_name = os.path.basename(server_file_path)
510
+ file_size = os.path.getsize(server_file_path)
511
+ file_info.append([idx, file_name, f"{round(file_size / 1024, 2)} KB"])
512
+ checkbox_options.append(f"{idx}. {file_name}")
513
+
514
+ # Populate indexed files table
515
+ indexed_file_display = [
516
+ [
517
+ os.path.basename(index_path).split(".")[0],
518
+ f"{round(os.path.getsize(index_path) / 1024, 2)} KB",
519
+ ]
520
+ for index_path in indexed_files
521
+ ]
522
+
523
+ return (
524
+ file_info,
525
+ gr.update(choices=checkbox_options),
526
+ (uploaded_files, indexed_files),
527
+ indexed_file_display,
528
+ )
529
+
530
+ @file_upload.upload(
531
+ inputs=[file_upload, uploaded_files_state],
532
+ outputs=[file_table, file_checkbox, file_upload, uploaded_files_state],
533
+ )
534
+ def upload_files(file_upload, uploaded_files_state):
535
+ uploaded_files, indexed_files = uploaded_files_state
536
+
537
+ for file_path in file_upload:
538
+ server_save_path = save_file(file_path)
539
+ if server_save_path:
540
+ uploaded_files.add(server_save_path)
541
+
542
+ save_uploaded_files_state(uploaded_files)
543
+
544
+ file_info = []
545
+ checkbox_options = []
546
+ for i, file_path in enumerate(uploaded_files, start=1):
547
+ file_name = os.path.basename(file_path)
548
+ file_size = os.path.getsize(file_path)
549
+ file_info.append([i, file_name, f"{round(file_size / 1024, 2)} KB"])
550
+ checkbox_options.append(f"{i}. {file_name}")
551
+
552
+ gr.Info("Successfully uploaded file(s).", duration=3)
553
+
554
+ return (
555
+ file_info,
556
+ gr.update(choices=checkbox_options),
557
+ [],
558
+ (uploaded_files, indexed_files),
559
+ )
560
+
561
+
562
+ demo.launch(share=True)
README.md CHANGED
@@ -1,12 +1,28 @@
1
  ---
2
- title: RAG Lib
3
- emoji: 📈
4
- colorFrom: blue
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.18.0
8
- app_file: app.py
9
- pinned: false
10
  ---
 
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: RAG-Lib
3
+ app_file: RAG UI.py
 
 
4
  sdk: gradio
5
+ sdk_version: 5.15.0
 
 
6
  ---
7
+ # RAG-Lib
8
+ RAG for UKY Libs.
9
 
10
+ ## Setup
11
+ 1. Install the following libraries
12
+
13
+ ```bash
14
+ python -m pip install -r /workspaces/RAG-Lib/requirements.txt
15
+ ```
16
+
17
+ 2. Set the environment variable "OPENAI_API_KEY"
18
+
19
+ ## Running Kotaemon
20
+ ```bash
21
+ docker run \
22
+ -e GRADIO_SERVER_NAME=0.0.0.0 \
23
+ -e GRADIO_SERVER_PORT=7860 \
24
+ -v ./ktem_app_data:/app/ktem_app_data \
25
+ -p 7860:7860 -it --rm \
26
+ --platform linux/amd64 \
27
+ ghcr.io/cinnamon/kotaemon:main-lite
28
+ ```
rag_model_uky_libs_api.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import csv
3
+
4
+ from llama_index.core.schema import Document
5
+ from llama_index.core.node_parser import SimpleNodeParser, SentenceSplitter
6
+ from llama_index.core.schema import IndexNode
7
+ from llama_index.core import ServiceContext
8
+ from llama_index.core.query_engine.retriever_query_engine import RetrieverQueryEngine
9
+ from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding
10
+ from llama_index.vector_stores.faiss import FaissVectorStore
11
+ from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
12
+ import faiss
13
+ from llama_index.core.settings import Settings
14
+ from llama_index.llms.openai import OpenAI
15
+ from transformers import BitsAndBytesConfig
16
+ from llama_index.core.prompts import PromptTemplate
17
+ import torch
18
+ import pandas as pd
19
+ import os
20
+ import json
21
+ #from IPython.display import Markdown, display
22
+
23
+ os.environ["OPENAI_API_KEY"]='use a registered OpenAI API key'
24
+
25
+
26
+ # def get_query_data(file_name):
27
+ # # with open("unique_questions.tsv",'r', encoding='UTF-8') as file:
28
+ # # tsv_file = csv.reader(file, delimiter="\t")
29
+
30
+ # df = pd.read_csv(file_name, delimiter="\t")
31
+ # n = len(df)
32
+ # query_ls = []
33
+ # for i in range (n):
34
+ # row = df.iloc[i]
35
+ # query = row['questions']
36
+ # query_ls.append(query)
37
+
38
+ # return query_ls
39
+
40
+ def get_squad_question():
41
+ with open('data_sample/squad_dev-v2.0.json', 'r') as file:
42
+ squad = json.load(file)
43
+
44
+ node_ls = []
45
+ question_ls = []
46
+
47
+ for i in range (len(squad['data'])):
48
+ x = len(squad['data'][i]['paragraphs'])
49
+ for j in range(x):
50
+ #context = squad['data'][i]['paragraphs'][j]['context']
51
+ #node_ls.append(context)
52
+ y = len(squad['data'][i]['paragraphs'][j]['qas'])
53
+ for k in range (y):
54
+ ques = squad['data'][i]['paragraphs'][j]['qas'][k]['question']
55
+ question_ls.append(ques)
56
+
57
+ return question_ls
58
+
59
def index_gen(file_name):
    """Build an in-memory FAISS vector index over the IRC accession .txt files.

    NOTE(review): the ``file_name`` argument is currently ignored — the input
    directory below is hard-coded. Confirm whether it should be used.
    """
    reader = SimpleDirectoryReader(input_dir="data_sample/Library_data/ASpaceAccessions/IRC_accessions_txt", required_exts=[".txt"])
    documents = reader.load_data()

    # Must match the output dimension of the configured embedding model
    # (BAAI/bge-small-en-v1.5).
    embedding_dim = 384

    faiss_index = faiss.IndexFlatL2(embedding_dim)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    parser = SentenceSplitter()
    nodes = parser.get_nodes_from_documents(documents)
    # Tag each node with its originating file so answers can cite a source.
    # NOTE(review): zip pairs nodes and documents positionally; if one
    # document splits into several nodes the pairing drifts — confirm intent.
    for node, doc in zip(nodes, documents):
        node.metadata["source"] = doc.metadata.get("file_name", "unknown")
    index = VectorStoreIndex(nodes, storage_context = storage_context)

    return index
79
+
80
+
81
# --- Script entry: configure models, build the index, run example queries ---

torch.cuda.empty_cache()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
file_name = "unique_questions.tsv"
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = embed_model


# Prompt that forbids the LLM from using knowledge outside the retrieved
# context; out-of-context queries must return the exact refusal string.
custom_prompt = PromptTemplate(
    template=(
        "Use the following context to answer the query. Do not use outside knowledge. "
        "If the answer is not found in the context, respond with: 'I do not have the answer.'\n"
        "Context: {context_str}\n"
        "Query: {query_str}\n"
        "Answer:"
    )
)

llm = OpenAI(temperature=0, model="gpt-4o-mini", max_tokens=512)

Settings.llm = llm
Settings.chunk_size = 512
Settings.chunk_overlap = 10

# NOTE(review): this reassignment shadows the .tsv path above, and index_gen
# ignores its argument anyway (it reads a hard-coded directory) — confirm.
file_name = "data_sample/Library_data/new_IRC_accessions.json"

index = index_gen(file_name)

nodes = index.docstore.docs.values()


# Retrieve the two most similar chunks per query and apply the strict prompt.
retriever = index.as_retriever(similarity_top_k=2)
query_engine = RetrieverQueryEngine(retriever=retriever)

query_engine.update_prompts({"response_synthesizer:text_qa_template": custom_prompt})

prompts_dict = query_engine.get_prompts()

# Example queries from a part of the UKY Libraries data.
# For each response, the source file name is pulled from the metadata of the
# first retrieved node.
response1 = query_engine.query("When did the Faculty council and curriculam committee establish?")
y = response1.metadata.values()
x = list(y)
file_name_res1 = x[0]['file_name']
# response1.source_nodes[0].metadata['file_name'] is another way to retrieve the file name
response2 = query_engine.query("Who did design the house in the photograph?")
y1 = response2.metadata.values()
x1 = list(y1)
file_name_res2 = x1[0]['file_name']

response3 = query_engine.query("What are the contents available on Ferrel Wellman's career?")
y2 = response3.metadata.values()
x2 = list(y2)
file_name_res3 = x2[0]['file_name']

response4 = query_engine.query("How many boxes of materials related to Good food Co-op are available in the library?")
y3 = response4.metadata.values()
x3 = list(y3)
file_name_res4 = x3[0]['file_name']

response5 = query_engine.query("What was the purpose of the KMF?")
y4 = response5.metadata.values()
x4 = list(y4)
file_name_res5 = x4[0]['file_name']

# Print each answer next to the file it came from.
print(response1.response, file_name_res1)
print("\n")
print(response2.response, file_name_res2)
print("\n")
print(response3.response, file_name_res3)
print("\n")
print(response4.response, file_name_res4)
print("\n")
print(response5.response, file_name_res5)
+
155
+
156
+
157
+
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ llama-index==0.11.3
3
+ transformers
4
+ llama-index-embeddings-huggingface
5
+ llama-index-vector-stores-faiss
6
+ faiss-cpu
7
+ llama-index-llms-openai
8
+ torch
9
+ numpy
10
+ scikit-learn
11
+ pymupdf
12
+ pandas
13
+ bitsandbytes
reset.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ rm -rf uploaded_files/
4
+ rm -rf persisted_indexes/
5
+
6
+ rm -f "uploaded_files_state.json"
7
+
8
+ echo "Reset completed."