frankai98 commited on
Commit
68286e4
·
verified ·
1 Parent(s): fa9cb53

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -18
app.py CHANGED
@@ -8,6 +8,9 @@ from streamlit.components.v1 import html
8
  import pandas as pd
9
  import torch
10
  import random
 
 
 
11
 
12
  # Retrieve the token from environment variables
13
  hf_token = os.environ.get("HF_TOKEN")
@@ -50,12 +53,26 @@ st.header("Review Scorer & Report Generator")
50
  # Concise introduction
51
  st.write("This model will score your reviews in your CSV file and generate a report based on your query and those results.")
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  # Load models with caching to avoid reloading on every run
54
  @st.cache_resource
55
  def load_models():
56
  llama_pipe = None
57
  score_pipe = None
58
- gemma_pipe = None
59
 
60
  try:
61
  st.info("Loading Llama 3.2 summarization model...")
@@ -64,6 +81,14 @@ def load_models():
64
  device=0, # Use GPU if available
65
  torch_dtype=torch.bfloat16,) # Use FP16 for efficiency
66
  st.success("Llama 3.2 summarization model loaded successfully!")
 
 
 
 
 
 
 
 
67
  except Exception as e:
68
  st.error(f"Error loading Llama 3.2 summarization model: {e}")
69
  st.error(f"Detailed error: {type(e).__name__}: {str(e)}")
@@ -74,26 +99,21 @@ def load_models():
74
  model="cardiffnlp/twitter-roberta-base-sentiment-latest",
75
  device=0 if torch.cuda.is_available() else -1)
76
  st.success("Sentiment analysis model loaded successfully!")
 
 
 
 
 
 
 
 
77
  except Exception as e:
78
  st.error(f"Error loading sentiment analysis model: {e}")
79
 
80
- try:
81
- st.info("Loading Gemma model...")
82
- # Load the tokenizer separately with the chat template
83
- tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
84
- gemma_pipe = pipeline("text-generation",
85
- model="google/gemma-3-1b-it",
86
- tokenizer=tokenizer, # Pass the loaded tokenizer here
87
- device=0,
88
- torch_dtype=torch.bfloat16)
89
- st.success("Gemma model loaded successfully!")
90
- except Exception as e:
91
- st.error(f"Error loading Gemma model: {e}")
92
- st.error(f"Detailed error: {type(e).__name__}: {str(e)}")
93
 
94
- return llama_pipe, score_pipe, gemma_pipe
95
 
96
- llama_pipe, score_pipe, gemma_pipe = load_models()
97
 
98
  def extract_assistant_content(raw_response):
99
  """Extract only the assistant's content from the Gemma-3 response."""
@@ -225,12 +245,27 @@ else:
225
  # Stage 2: Generate Report using Gemma in the new messages format.
226
  status_text.markdown("**📝 Generating report with Gemma...**")
227
 
228
- # Clear CUDA cache before using Gemma to free up memory
 
 
229
  torch.cuda.empty_cache()
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
  # Sample or summarize the data for Gemma to avoid memory issues
232
  import random
233
- max_reviews = 100 # Adjust based on your GPU memory
234
  if len(scored_docs) > max_reviews:
235
  sampled_docs = random.sample(scored_docs, max_reviews)
236
  st.info(f"Sampling {max_reviews} out of {len(scored_docs)} reviews for report generation")
 
8
  import pandas as pd
9
  import torch
10
  import random
11
+ import gc
12
+ import time
13
+ from threading import Thread
14
 
15
  # Retrieve the token from environment variables
16
  hf_token = os.environ.get("HF_TOKEN")
 
53
  # Concise introduction
54
  st.write("This model will score your reviews in your CSV file and generate a report based on your query and those results.")
55
 
56
+ def display_temp_message(message, message_type="info", duration=5):
57
+ """Display a temporary message that disappears after specified duration."""
58
+ placeholder = st.empty()
59
+
60
+ if message_type == "info":
61
+ placeholder.info(message)
62
+ elif message_type == "success":
63
+ placeholder.success(message)
64
+ elif message_type == "error":
65
+ placeholder.error(message)
66
+
67
+ # Wait and then clear the message
68
+ time.sleep(duration)
69
+ placeholder.empty()
70
+
71
  # Load models with caching to avoid reloading on every run
72
  @st.cache_resource
73
  def load_models():
74
  llama_pipe = None
75
  score_pipe = None
 
76
 
77
  try:
78
  st.info("Loading Llama 3.2 summarization model...")
 
81
  device=0, # Use GPU if available
82
  torch_dtype=torch.bfloat16,) # Use FP16 for efficiency
83
  st.success("Llama 3.2 summarization model loaded successfully!")
84
+
85
+ # Display success message that will disappear after 5 seconds
86
+ Thread(
87
+ target=display_temp_message,
88
+ args=("Llama 3.2 summarization model loaded successfully!", "success"),
89
+ daemon=True
90
+ ).start()
91
+
92
  except Exception as e:
93
  st.error(f"Error loading Llama 3.2 summarization model: {e}")
94
  st.error(f"Detailed error: {type(e).__name__}: {str(e)}")
 
99
  model="cardiffnlp/twitter-roberta-base-sentiment-latest",
100
  device=0 if torch.cuda.is_available() else -1)
101
  st.success("Sentiment analysis model loaded successfully!")
102
+
103
+ # Display success message that will disappear after 5 seconds
104
+ Thread(
105
+ target=display_temp_message,
106
+ args=("Sentiment analysis model loaded successfully!", "success"),
107
+ daemon=True
108
+ ).start()
109
+
110
  except Exception as e:
111
  st.error(f"Error loading sentiment analysis model: {e}")
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
+ return llama_pipe, score_pipe
115
 
116
+ llama_pipe, score_pipe = load_models()
117
 
118
  def extract_assistant_content(raw_response):
119
  """Extract only the assistant's content from the Gemma-3 response."""
 
245
  # Stage 2: Generate Report using Gemma in the new messages format.
246
  status_text.markdown("**📝 Generating report with Gemma...**")
247
 
248
+ # After using score_pipe
249
+ del score_pipe
250
+ gc.collect()
251
  torch.cuda.empty_cache()
252
+
253
+ # After using llama_pipe (the Llama 3.2 summarization model)
254
+ del llama_pipe
255
+ gc.collect()
256
+ torch.cuda.empty_cache()
257
+
258
+ # Then reload Gemma specifically for the final step
259
+ tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
260
+ gemma_pipe = pipeline("text-generation",
261
+ model="google/gemma-3-1b-it",
262
+ tokenizer=tokenizer,
263
+ device=0,
264
+ torch_dtype=torch.bfloat16)
265
 
266
  # Sample or summarize the data for Gemma to avoid memory issues
267
  import random
268
+ max_reviews = 50 # Adjust based on your GPU memory
269
  if len(scored_docs) > max_reviews:
270
  sampled_docs = random.sample(scored_docs, max_reviews)
271
  st.info(f"Sampling {max_reviews} out of {len(scored_docs)} reviews for report generation")