Basu03 committed on
Commit
2d12dcf
·
1 Parent(s): 31d912a

vertex ai minor bugs 4

Browse files
Files changed (3) hide show
  1. .DS_Store +0 -0
  2. requirements.txt +14 -3
  3. src/local_llm_handler.py +11 -5
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
requirements.txt CHANGED
@@ -1,5 +1,16 @@
1
- gradio>=5.0.0
2
- transformers>=4.41.0
 
 
 
 
 
 
 
3
  torch
 
 
4
  langgraph
5
- accelerate
 
 
 
1
+ # Pin Gradio to a known stable version
2
+ gradio==4.29.0
3
+
4
+ # Pinning transformers and accelerate to fix the 'seen_tokens' bug with Phi-3
5
+ transformers==4.42.3
6
+ accelerate==0.32.1
7
+
8
+ # These are often required dependencies for attention mechanisms and can help stability
9
+ einops
10
  torch
11
+
12
+ # LangGraph itself
13
  langgraph
14
+
15
+ # Optional but recommended by the Phi-3 model card for GPU performance
16
+ flash-attn
src/local_llm_handler.py CHANGED
@@ -11,7 +11,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
11
  def load_llm_pipeline():
12
  """
13
  Loads and caches the local LLM pipeline using Phi-3-mini-4k-instruct.
14
- Designed for Hugging Face Spaces (with upgraded CPU or T4 GPU).
15
  """
16
  print("--- Loading main LLM: microsoft/Phi-3-mini-4k-instruct ---")
17
  model_name = "microsoft/phi-3-mini-4k-instruct"
@@ -21,17 +20,19 @@ def load_llm_pipeline():
21
  model = AutoModelForCausalLM.from_pretrained(
22
  model_name,
23
  device_map="auto",
24
- torch_dtype=torch.float32,
25
  trust_remote_code=True
26
  )
27
 
28
  # Build text generation pipeline
 
29
  llm_pipeline = pipeline(
30
  "text-generation",
31
  model=model,
32
  tokenizer=tokenizer,
33
  max_new_tokens=300,
34
- return_full_text=False
 
35
  )
36
 
37
  print("--- Phi-3-mini model loaded successfully ---")
@@ -42,7 +43,12 @@ def get_llm_response(prompt: str) -> str:
42
  Gets a response from the cached Phi-3-mini LLM pipeline.
43
  """
44
  llm_pipeline = load_llm_pipeline()
45
- formatted_prompt = f"<|user|>\n{prompt}\n<|assistant|>"
 
 
 
 
 
46
 
47
  print("AI: (Generating response with Phi-3-mini...)")
48
  try:
@@ -51,4 +57,4 @@ def get_llm_response(prompt: str) -> str:
51
  return response
52
  except Exception as e:
53
  print(f"Error during Phi-3-mini generation: {e}")
54
- return "Sorry, I encountered an error while generating a response."
 
11
  def load_llm_pipeline():
12
  """
13
  Loads and caches the local LLM pipeline using Phi-3-mini-4k-instruct.
 
14
  """
15
  print("--- Loading main LLM: microsoft/Phi-3-mini-4k-instruct ---")
16
  model_name = "microsoft/phi-3-mini-4k-instruct"
 
20
  model = AutoModelForCausalLM.from_pretrained(
21
  model_name,
22
  device_map="auto",
23
+ torch_dtype="auto", # Use "auto" for better compatibility
24
  trust_remote_code=True
25
  )
26
 
27
  # Build text generation pipeline
28
+ # CORRECTED: Added eos_token_id for cleaner, more reliable generation
29
  llm_pipeline = pipeline(
30
  "text-generation",
31
  model=model,
32
  tokenizer=tokenizer,
33
  max_new_tokens=300,
34
+ return_full_text=False,
35
+ eos_token_id=tokenizer.eos_token_id # Crucial for stopping generation
36
  )
37
 
38
  print("--- Phi-3-mini model loaded successfully ---")
 
43
  Gets a response from the cached Phi-3-mini LLM pipeline.
44
  """
45
  llm_pipeline = load_llm_pipeline()
46
+ # Phi-3 uses a specific chat template format
47
+ messages = [
48
+ {"role": "user", "content": prompt},
49
+ ]
50
+ # Use the tokenizer's built-in chat template for the most reliable formatting
51
+ formatted_prompt = llm_pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
52
 
53
  print("AI: (Generating response with Phi-3-mini...)")
54
  try:
 
57
  return response
58
  except Exception as e:
59
  print(f"Error during Phi-3-mini generation: {e}")
60
+ return "Sorry, I encountered an error while generating a response."