Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,13 +15,18 @@ from rank_bm25 import BM25Okapi
|
|
| 15 |
from gensim.models import Word2Vec
|
| 16 |
from typing import List, Optional, Tuple
|
| 17 |
import gradio as gr
|
| 18 |
-
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
logging.basicConfig(level=logging.INFO)
|
| 21 |
|
| 22 |
api_key = os.getenv("MISTRAL_API_KEY")
|
| 23 |
client = Mistral(api_key=api_key)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10, max_delay=60):
|
| 26 |
embeddings = []
|
| 27 |
for text in text_list:
|
|
@@ -48,30 +53,55 @@ def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=1
|
|
| 48 |
return embeddings
|
| 49 |
|
| 50 |
def store_embeddings_in_vector_db(
|
| 51 |
-
|
| 52 |
vector_db_path: str,
|
| 53 |
annoy_index_path: str,
|
| 54 |
chunk_size: int = 2048,
|
| 55 |
overlap: int = 200,
|
| 56 |
num_trees: int = 10
|
| 57 |
):
|
| 58 |
-
doc = fitz.open(pdf_path)
|
| 59 |
-
all_embeddings = []
|
| 60 |
all_texts = []
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
embeddings_np = np.array(all_embeddings).astype('float32')
|
| 77 |
with open(vector_db_path, "wb") as f:
|
|
@@ -327,7 +357,7 @@ def chatbot_interface(file, user_query, response_style):
|
|
| 327 |
selected_reranking_methods=selected_reranking_methods_list
|
| 328 |
))
|
| 329 |
|
| 330 |
-
formatted_response = f"Response
|
| 331 |
formatted_response += "Retrieved and Reranked Documents:\n"
|
| 332 |
for idx, doc_info in enumerate(source_info, start=1):
|
| 333 |
formatted_response += f"\nDocument {idx}:\n"
|
|
@@ -335,7 +365,6 @@ def chatbot_interface(file, user_query, response_style):
|
|
| 335 |
formatted_response += f"Retrieval Method: {doc_info['method']}\n"
|
| 336 |
if 'score' in doc_info:
|
| 337 |
formatted_response += f"Precision Score: {doc_info['score']:.4f}\n"
|
| 338 |
-
|
| 339 |
return formatted_response
|
| 340 |
|
| 341 |
iface = gr.Blocks(theme="Rabbitt-AI/ChanceRAG")
|
|
@@ -351,7 +380,7 @@ with iface:
|
|
| 351 |
"Detailed", "Concise", "Creative", "Technical"], label="Response Style"
|
| 352 |
),
|
| 353 |
],
|
| 354 |
-
outputs= gr.
|
| 355 |
)
|
| 356 |
|
| 357 |
iface.launch(share=True)
|
|
|
|
| 15 |
from gensim.models import Word2Vec
|
| 16 |
from typing import List, Optional, Tuple
|
| 17 |
import gradio as gr
|
| 18 |
+
import moviepy.editor as mp
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
logging.basicConfig(level=logging.INFO)
|
| 21 |
|
| 22 |
api_key = os.getenv("MISTRAL_API_KEY")
|
| 23 |
client = Mistral(api_key=api_key)
|
| 24 |
|
| 25 |
+
from deepgram import Deepgram
|
| 26 |
+
|
| 27 |
+
dg_api_key = os.getenv("DEEPGRAM_API_KEY")
|
| 28 |
+
deepgram = Deepgram(dg_api_key)
|
| 29 |
+
|
| 30 |
def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10, max_delay=60):
|
| 31 |
embeddings = []
|
| 32 |
for text in text_list:
|
|
|
|
| 53 |
return embeddings
|
| 54 |
|
| 55 |
def store_embeddings_in_vector_db(
|
| 56 |
+
file_path: str,
|
| 57 |
vector_db_path: str,
|
| 58 |
annoy_index_path: str,
|
| 59 |
chunk_size: int = 2048,
|
| 60 |
overlap: int = 200,
|
| 61 |
num_trees: int = 10
|
| 62 |
):
|
|
|
|
|
|
|
| 63 |
all_texts = []
|
| 64 |
+
if file_path.endswith(('.pdf', '.doc', '.docx' , '.pptx' , '.ppt' , '.xls', '.xlsx' , '.txt' )):
|
| 65 |
+
doc = fitz.open(file_path)
|
| 66 |
+
all_embeddings = []
|
| 67 |
+
total_pages = doc.page_count
|
| 68 |
+
logging.info(f"Processing PDF/DOC: {file_path} with {total_pages} pages.")
|
| 69 |
+
|
| 70 |
+
for page_num in range(total_pages):
|
| 71 |
+
page = doc.load_page(page_num)
|
| 72 |
+
text = page.get_text()
|
| 73 |
+
if text.strip():
|
| 74 |
+
chunks = split_text_into_chunks(text, chunk_size, overlap)
|
| 75 |
+
embeddings = get_text_embedding_with_rate_limit(chunks)
|
| 76 |
+
all_embeddings.extend(embeddings)
|
| 77 |
+
all_texts.extend(chunks)
|
| 78 |
+
logging.info(f"Processed page {page_num + 1}/{total_pages}, extracted {len(chunks)} chunks.")
|
| 79 |
+
else:
|
| 80 |
+
logging.warning(f"No text found on page {page_num + 1}.")
|
| 81 |
+
elif file_path.endswith(('.mp3', '.wav', '.m4a')):
|
| 82 |
+
logging.info(f"Processing audio file: {file_path}")
|
| 83 |
+
with open(file_path, 'rb') as audio_file:
|
| 84 |
+
audio_content = audio_file.read()
|
| 85 |
+
response = asyncio.run(deepgram.transcription.prerecorded({'buffer': audio_content, 'mimetype': 'audio/wav'}, {'punctuate': True}))
|
| 86 |
+
text = response['results']['channels'][0]['alternatives'][0]['transcript']
|
| 87 |
+
chunks = split_text_into_chunks(text, chunk_size, overlap)
|
| 88 |
+
all_embeddings = get_text_embedding_with_rate_limit(chunks)
|
| 89 |
+
all_texts.extend(chunks)
|
| 90 |
+
elif file_path.endswith(('.mp4', '.avi', '.mov')):
|
| 91 |
+
logging.info(f"Processing video file: {file_path}")
|
| 92 |
+
video = mp.VideoFileClip(file_path)
|
| 93 |
+
audio_path = "temp_audio.wav"
|
| 94 |
+
video.audio.write_audiofile(audio_path)
|
| 95 |
+
with open(audio_path, 'rb') as audio_file:
|
| 96 |
+
audio_content = audio_file.read()
|
| 97 |
+
response = asyncio.run(deepgram.transcription.prerecorded({'buffer': audio_content, 'mimetype': 'audio/wav'}, {'punctuate': True}))
|
| 98 |
+
text = response['results']['channels'][0]['alternatives'][0]['transcript']
|
| 99 |
+
os.remove(audio_path)
|
| 100 |
+
chunks = split_text_into_chunks(text, chunk_size, overlap)
|
| 101 |
+
all_embeddings = get_text_embedding_with_rate_limit(chunks)
|
| 102 |
+
all_texts.extend(chunks)
|
| 103 |
+
else:
|
| 104 |
+
raise ValueError("Unsupported file format. Please upload a PDF, DOC, DOCX, MP3, WAV, M4A, MP4, AVI, or MOV file.")
|
| 105 |
|
| 106 |
embeddings_np = np.array(all_embeddings).astype('float32')
|
| 107 |
with open(vector_db_path, "wb") as f:
|
|
|
|
| 357 |
selected_reranking_methods=selected_reranking_methods_list
|
| 358 |
))
|
| 359 |
|
| 360 |
+
formatted_response = f"# **ChanceRAG Response:**\n\n{response}\n\n"
|
| 361 |
formatted_response += "Retrieved and Reranked Documents:\n"
|
| 362 |
for idx, doc_info in enumerate(source_info, start=1):
|
| 363 |
formatted_response += f"\nDocument {idx}:\n"
|
|
|
|
| 365 |
formatted_response += f"Retrieval Method: {doc_info['method']}\n"
|
| 366 |
if 'score' in doc_info:
|
| 367 |
formatted_response += f"Precision Score: {doc_info['score']:.4f}\n"
|
|
|
|
| 368 |
return formatted_response
|
| 369 |
|
| 370 |
iface = gr.Blocks(theme="Rabbitt-AI/ChanceRAG")
|
|
|
|
| 380 |
"Detailed", "Concise", "Creative", "Technical"], label="Response Style"
|
| 381 |
),
|
| 382 |
],
|
| 383 |
+
outputs= gr.Markdown(value="# **ChanceRAG Response**"),
|
| 384 |
)
|
| 385 |
|
| 386 |
iface.launch(share=True)
|