Spaces:

Rabbitt-AI
/

ChanceRAG

Running

App Files Files Community

Rabbitt-AI committed on Oct 9, 2024

Commit

b512d5e

verified ·

1 Parent(s): b1e1d68

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -21

app.py CHANGED Viewed

@@ -15,13 +15,18 @@ from rank_bm25 import BM25Okapi
 from gensim.models import Word2Vec
 from typing import List, Optional, Tuple
 import gradio as gr
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 api_key = os.getenv("MISTRAL_API_KEY")
 client = Mistral(api_key=api_key)
 def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10, max_delay=60):
     embeddings = []
     for text in text_list:
@@ -48,30 +53,55 @@ def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=1
     return embeddings
 def store_embeddings_in_vector_db(
-    pdf_path: str,
     vector_db_path: str,
     annoy_index_path: str,
     chunk_size: int = 2048,
     overlap: int = 200,
     num_trees: int = 10
 ):
-    doc = fitz.open(pdf_path)
-    all_embeddings = []
     all_texts = []
-    total_pages = doc.page_count
-    logging.info(f"Processing PDF: {pdf_path} with {total_pages} pages.")
-    for page_num in range(total_pages):
-        page = doc.load_page(page_num)
-        text = page.get_text()
-        if text.strip():
-            chunks = split_text_into_chunks(text, chunk_size, overlap)
-            embeddings = get_text_embedding_with_rate_limit(chunks)
-            all_embeddings.extend(embeddings)
-            all_texts.extend(chunks)
-            logging.info(f"Processed page {page_num + 1}/{total_pages}, extracted {len(chunks)} chunks.")
-        else:
-            logging.warning(f"No text found on page {page_num + 1}.")
     embeddings_np = np.array(all_embeddings).astype('float32')
     with open(vector_db_path, "wb") as f:
@@ -327,7 +357,7 @@ def chatbot_interface(file, user_query, response_style):
         selected_reranking_methods=selected_reranking_methods_list
     ))
-    formatted_response = f"Response:\n{response}\n\n"
     formatted_response += "Retrieved and Reranked Documents:\n"
     for idx, doc_info in enumerate(source_info, start=1):
         formatted_response += f"\nDocument {idx}:\n"
@@ -335,7 +365,6 @@ def chatbot_interface(file, user_query, response_style):
         formatted_response += f"Retrieval Method: {doc_info['method']}\n"
         if 'score' in doc_info:
             formatted_response += f"Precision Score: {doc_info['score']:.4f}\n"
     return formatted_response
 iface = gr.Blocks(theme="Rabbitt-AI/ChanceRAG")
@@ -351,7 +380,7 @@ with iface:
                 "Detailed", "Concise", "Creative", "Technical"], label="Response Style"
             ),
         ],
-        outputs= gr.Textbox(label="ChanceRAG Response"),
     )
 iface.launch(share=True)

 from gensim.models import Word2Vec
 from typing import List, Optional, Tuple
 import gradio as gr
+import moviepy.editor as mp
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 api_key = os.getenv("MISTRAL_API_KEY")
 client = Mistral(api_key=api_key)
+from deepgram import Deepgram
+dg_api_key = os.getenv("DEEPGRAM_API_KEY")
+deepgram = Deepgram(dg_api_key)
 def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10, max_delay=60):
     embeddings = []
     for text in text_list:
     return embeddings
 def store_embeddings_in_vector_db(
+    file_path: str,
     vector_db_path: str,
     annoy_index_path: str,
     chunk_size: int = 2048,
     overlap: int = 200,
     num_trees: int = 10
 ):
     all_texts = []
+    if file_path.endswith(('.pdf', '.doc', '.docx' , '.pptx' , '.ppt' , '.xls', '.xlsx' , '.txt' )):
+        doc = fitz.open(file_path)
+        all_embeddings = []
+        total_pages = doc.page_count
+        logging.info(f"Processing PDF/DOC: {file_path} with {total_pages} pages.")
+        for page_num in range(total_pages):
+            page = doc.load_page(page_num)
+            text = page.get_text()
+            if text.strip():
+                chunks = split_text_into_chunks(text, chunk_size, overlap)
+                embeddings = get_text_embedding_with_rate_limit(chunks)
+                all_embeddings.extend(embeddings)
+                all_texts.extend(chunks)
+                logging.info(f"Processed page {page_num + 1}/{total_pages}, extracted {len(chunks)} chunks.")
+            else:
+                logging.warning(f"No text found on page {page_num + 1}.")
+    elif file_path.endswith(('.mp3', '.wav', '.m4a')):
+        logging.info(f"Processing audio file: {file_path}")
+        with open(file_path, 'rb') as audio_file:
+            audio_content = audio_file.read()
+            response = asyncio.run(deepgram.transcription.prerecorded({'buffer': audio_content, 'mimetype': 'audio/wav'}, {'punctuate': True}))
+        text = response['results']['channels'][0]['alternatives'][0]['transcript']
+        chunks = split_text_into_chunks(text, chunk_size, overlap)
+        all_embeddings = get_text_embedding_with_rate_limit(chunks)
+        all_texts.extend(chunks)
+    elif file_path.endswith(('.mp4', '.avi', '.mov')):
+        logging.info(f"Processing video file: {file_path}")
+        video = mp.VideoFileClip(file_path)
+        audio_path = "temp_audio.wav"
+        video.audio.write_audiofile(audio_path)
+        with open(audio_path, 'rb') as audio_file:
+            audio_content = audio_file.read()
+            response = asyncio.run(deepgram.transcription.prerecorded({'buffer': audio_content, 'mimetype': 'audio/wav'}, {'punctuate': True}))
+        text = response['results']['channels'][0]['alternatives'][0]['transcript']
+        os.remove(audio_path)
+        chunks = split_text_into_chunks(text, chunk_size, overlap)
+        all_embeddings = get_text_embedding_with_rate_limit(chunks)
+        all_texts.extend(chunks)
+    else:
+        raise ValueError("Unsupported file format. Please upload a PDF, DOC, DOCX, MP3, WAV, M4A, MP4, AVI, or MOV file.")
     embeddings_np = np.array(all_embeddings).astype('float32')
     with open(vector_db_path, "wb") as f:
         selected_reranking_methods=selected_reranking_methods_list
     ))
+    formatted_response = f"# **ChanceRAG Response:**\n\n{response}\n\n"
     formatted_response += "Retrieved and Reranked Documents:\n"
     for idx, doc_info in enumerate(source_info, start=1):
         formatted_response += f"\nDocument {idx}:\n"
         formatted_response += f"Retrieval Method: {doc_info['method']}\n"
         if 'score' in doc_info:
             formatted_response += f"Precision Score: {doc_info['score']:.4f}\n"
     return formatted_response
 iface = gr.Blocks(theme="Rabbitt-AI/ChanceRAG")
                 "Detailed", "Concise", "Creative", "Technical"], label="Response Style"
             ),
         ],
+        outputs= gr.Markdown(value="# **ChanceRAG Response**"),
     )
 iface.launch(share=True)