Spaces:

Ultronprime
/

Emails2go

Build error

App Files Community

Ultronprime committed on Feb 4, 2025

Commit

bb01969

verified ·

1 Parent(s): 09c1ee0

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -34

app.py CHANGED Viewed

@@ -45,7 +45,7 @@ os.makedirs(NPY_CACHE, exist_ok=True, mode=0o777)
 LOG_DIR = os.getenv("LOG_DIR", os.path.join(PERSISTENT_PATH, "logs"))
 os.makedirs(LOG_DIR, exist_ok=True, mode=0o777)
-# Set Hugging Face cache directory to PERSISTENT_PATH
 os.environ["HF_HOME"] = os.path.join(PERSISTENT_PATH, ".huggingface")
 os.makedirs(os.environ["HF_HOME"], exist_ok=True, mode=0o777)
@@ -64,7 +64,9 @@ def initialize_model():
     global model
     try:
         if model is None:
-            model = SentenceTransformer(EMBEDDING_MODEL_NAME, cache_folder=os.path.join(PERSISTENT_PATH, "models"))
             logger.info(f"Initialized model: {EMBEDDING_MODEL_NAME}")
         return True
     except requests.exceptions.ConnectionError as e:
@@ -78,6 +80,7 @@ def initialize_model():
 def handle_gpu_operation(func):
     try:
         start_time = datetime.now()
         with autocast(device_type='cuda', dtype=torch.float16):
             result = func()
         end_time = datetime.now()
@@ -121,7 +124,7 @@ def process_files(files):
         valid_files = [f for f in files if f.name.lower().endswith('.txt')]
         if not valid_files:
-            return "No .txt files found in upload. Please ensure you upload .txt files.", "", ""
         all_chunks = []
         processed_files = 0
@@ -133,6 +136,7 @@ def process_files(files):
                     detected_encoding = from_bytes(content).best().encoding
                     decoded_content = content.decode(detected_encoding, errors='ignore')
                 chunks = [decoded_content[i:i+CHUNK_SIZE] for i in range(0, len(decoded_content), CHUNK_SIZE)]
                 all_chunks.extend(chunks)
                 processed_files += 1
@@ -141,7 +145,7 @@ def process_files(files):
                 logger.error(f"Error processing file {file.name}: {str(e)}")
         if not all_chunks:
-            return "No valid content found in the uploaded .txt files.", "", ""
         # Generate embeddings in batches
         all_embeddings = []
@@ -156,7 +160,6 @@ def process_files(files):
         # Save results to OUTPUTS_DIR
         embeddings_path = os.path.join(OUTPUTS_DIR, "embeddings.npy")
         np.save(embeddings_path, np.array(all_embeddings))
         chunks_path = os.path.join(OUTPUTS_DIR, "chunks.txt")
         with open(chunks_path, "w", encoding="utf-8") as f:
             for chunk in all_chunks:
@@ -179,19 +182,16 @@ def semantic_search(query, top_k=5):
         return "Model not initialized. Please process files first."
     try:
-        # Load saved embeddings from OUTPUTS_DIR
-        stored_embeddings = np.load(os.path.join(OUTPUTS_DIR, "embeddings.npy"))
-        # Load stored chunks from OUTPUTS_DIR
-        with open(os.path.join(OUTPUTS_DIR, "chunks.txt"), "r", encoding="utf-8") as f:
             chunks = f.read().split("\n===CHUNK_SEPARATOR===\n")
             chunks = [c for c in chunks if c.strip()]
         # Get query embedding
-        if model:
-            query_embedding = model.encode([query])[0]
-        else:
-            return "Model not initialized. Please process files first."
         # Calculate similarities
         similarities = np.dot(stored_embeddings, query_embedding) / (
@@ -200,8 +200,6 @@ def semantic_search(query, top_k=5):
         # Get top results
         top_indices = np.argsort(similarities)[-top_k:][::-1]
-        # Format results
         results = []
         for idx in top_indices:
             results.append(f"""
@@ -209,9 +207,7 @@ Similarity: {similarities[idx]:.3f}
 Content: {chunks[idx]}
 -------------------
 """)
         return "\n".join(results)
     except Exception as e:
         logger.error(f"Search error: {str(e)}")
         return f"Search error occurred: {str(e)}"
@@ -223,11 +219,12 @@ def search_and_format(query, num_results):
 def browse_outputs():
     try:
         webbrowser.open(f"file://{OUTPUTS_DIR}")
-        return "Opened outputs directory"
     except Exception as e:
         logger.error(f"Error opening file browser: {str(e)}")
-        return "Error opening file browser"
 def download_results():
     required_files = ["embeddings.npy", "chunks.txt"]
@@ -235,15 +232,15 @@ def download_results():
     if missing:
         logger.error(f"Missing files: {missing}")
         return None
     try:
         zip_path = os.path.join(OUTPUTS_DIR, "results.zip")
         with zipfile.ZipFile(zip_path, 'w') as zipf:
             for file in required_files:
-                zipf.write(os.path.join(OUTPUTS_DIR, file), file)
         return zip_path
     except Exception as e:
-        logger.error(f"Error creating download: {str(e)}")
         return None
 def create_gradio_interface():
@@ -262,12 +259,18 @@ def create_gradio_interface():
         process_button = gr.Button("Generate Embeddings")
         output_text = gr.Textbox(label="Status")
         with gr.Tab("Search"):
             query_input = gr.Textbox(
                 label="Enter your search query",
                 placeholder="Enter text to search through your documents..."
             )
-            top_k = gr.Slider(
                 minimum=1,
                 maximum=20,
                 value=5,
@@ -280,10 +283,9 @@ def create_gradio_interface():
                 lines=10,
                 show_copy_button=True
             )
             search_button.click(
                 fn=search_and_format,
-                inputs=[query_input, top_k],
                 outputs=results_output
             )
@@ -297,17 +299,11 @@ def create_gradio_interface():
             browse_button = gr.Button("📁 Browse Outputs")
             browse_button.click(
                 fn=browse_outputs,
-                outputs=None
             )
-        process_button.click(
-            process_files,
-            inputs=[file_input],
-            outputs=[output_text, error_box, error_box]
-        )
     return demo
 if __name__ == "__main__":
     demo = create_gradio_interface()
-    demo.launch(server_name="0.0.0.0")

 LOG_DIR = os.getenv("LOG_DIR", os.path.join(PERSISTENT_PATH, "logs"))
 os.makedirs(LOG_DIR, exist_ok=True, mode=0o777)
+# Set Hugging Face cache directory to persistent storage
 os.environ["HF_HOME"] = os.path.join(PERSISTENT_PATH, ".huggingface")
 os.makedirs(os.environ["HF_HOME"], exist_ok=True, mode=0o777)
     global model
     try:
         if model is None:
+            model_cache = os.path.join(PERSISTENT_PATH, "models")
+            os.makedirs(model_cache, exist_ok=True, mode=0o777)
+            model = SentenceTransformer(EMBEDDING_MODEL_NAME, cache_folder=model_cache)
             logger.info(f"Initialized model: {EMBEDDING_MODEL_NAME}")
         return True
     except requests.exceptions.ConnectionError as e:
 def handle_gpu_operation(func):
     try:
         start_time = datetime.now()
+        # Updated autocast usage as per deprecation notice
         with autocast(device_type='cuda', dtype=torch.float16):
             result = func()
         end_time = datetime.now()
         valid_files = [f for f in files if f.name.lower().endswith('.txt')]
         if not valid_files:
+            return "No .txt files found. Please upload valid .txt files.", "", ""
         all_chunks = []
         processed_files = 0
                     detected_encoding = from_bytes(content).best().encoding
                     decoded_content = content.decode(detected_encoding, errors='ignore')
+                # Split content into chunks
                 chunks = [decoded_content[i:i+CHUNK_SIZE] for i in range(0, len(decoded_content), CHUNK_SIZE)]
                 all_chunks.extend(chunks)
                 processed_files += 1
                 logger.error(f"Error processing file {file.name}: {str(e)}")
         if not all_chunks:
+            return "No valid content found in the uploaded files.", "", ""
         # Generate embeddings in batches
         all_embeddings = []
         # Save results to OUTPUTS_DIR
         embeddings_path = os.path.join(OUTPUTS_DIR, "embeddings.npy")
         np.save(embeddings_path, np.array(all_embeddings))
         chunks_path = os.path.join(OUTPUTS_DIR, "chunks.txt")
         with open(chunks_path, "w", encoding="utf-8") as f:
             for chunk in all_chunks:
         return "Model not initialized. Please process files first."
     try:
+        # Load saved embeddings and chunks from OUTPUTS_DIR
+        embeddings_file = os.path.join(OUTPUTS_DIR, "embeddings.npy")
+        chunks_file = os.path.join(OUTPUTS_DIR, "chunks.txt")
+        stored_embeddings = np.load(embeddings_file)
+        with open(chunks_file, "r", encoding="utf-8") as f:
             chunks = f.read().split("\n===CHUNK_SEPARATOR===\n")
             chunks = [c for c in chunks if c.strip()]
         # Get query embedding
+        query_embedding = model.encode([query])[0]
         # Calculate similarities
         similarities = np.dot(stored_embeddings, query_embedding) / (
         # Get top results
         top_indices = np.argsort(similarities)[-top_k:][::-1]
         results = []
         for idx in top_indices:
             results.append(f"""
 Content: {chunks[idx]}
 -------------------
 """)
         return "\n".join(results)
     except Exception as e:
         logger.error(f"Search error: {str(e)}")
         return f"Search error occurred: {str(e)}"
 def browse_outputs():
     try:
+        # Open the outputs directory in a web browser (may work on some systems)
         webbrowser.open(f"file://{OUTPUTS_DIR}")
+        return "Opened outputs directory."
     except Exception as e:
         logger.error(f"Error opening file browser: {str(e)}")
+        return "Error opening file browser."
 def download_results():
     required_files = ["embeddings.npy", "chunks.txt"]
     if missing:
         logger.error(f"Missing files: {missing}")
         return None
     try:
         zip_path = os.path.join(OUTPUTS_DIR, "results.zip")
         with zipfile.ZipFile(zip_path, 'w') as zipf:
             for file in required_files:
+                file_path = os.path.join(OUTPUTS_DIR, file)
+                zipf.write(file_path, file)
         return zip_path
     except Exception as e:
+        logger.error(f"Error creating download archive: {str(e)}")
         return None
 def create_gradio_interface():
         process_button = gr.Button("Generate Embeddings")
         output_text = gr.Textbox(label="Status")
+        process_button.click(
+            fn=process_files,
+            inputs=[file_input],
+            outputs=[output_text, error_box, error_box]
+        )
         with gr.Tab("Search"):
             query_input = gr.Textbox(
                 label="Enter your search query",
                 placeholder="Enter text to search through your documents..."
             )
+            top_k_slider = gr.Slider(
                 minimum=1,
                 maximum=20,
                 value=5,
                 lines=10,
                 show_copy_button=True
             )
             search_button.click(
                 fn=search_and_format,
+                inputs=[query_input, top_k_slider],
                 outputs=results_output
             )
             browse_button = gr.Button("📁 Browse Outputs")
             browse_button.click(
                 fn=browse_outputs,
+                outputs=[gr.Textbox(label="Browse Status")]
             )
     return demo
 if __name__ == "__main__":
     demo = create_gradio_interface()
+    demo.launch(server_name="0.0.0.0")