Basu03 committed on
Commit
2d12dcf
·
1 Parent(s): 31d912a

vertex ai minor bugs 4

Browse files
Files changed (3) hide show
  1. .DS_Store +0 -0
  2. requirements.txt +14 -3
  3. src/local_llm_handler.py +11 -5
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
requirements.txt CHANGED
@@ -1,5 +1,16 @@
1
- gradio>=5.0.0
2
- transformers>=4.41.0
 
 
 
 
 
 
 
3
  torch
 
 
4
  langgraph
5
- accelerate
 
 
 
1
+ # Pin Gradio to a known stable version
2
+ gradio==4.29.0
3
+
4
+ # Pinning transformers and accelerate to fix the 'seen_tokens' bug with Phi-3
5
+ transformers==4.42.3
6
+ accelerate==0.32.1
7
+
8
+ # These are often required dependencies for attention mechanisms and can help stability
9
+ einops
10
  torch
11
+
12
+ # LangGraph itself
13
  langgraph
14
+
15
+ # Optional but recommended by the Phi-3 model card for GPU performance
16
+ flash-attn
src/local_llm_handler.py CHANGED
@@ -11,7 +11,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
11
  def load_llm_pipeline():
12
  """
13
  Loads and caches the local LLM pipeline using Phi-3-mini-4k-instruct.
14
- Designed for Hugging Face Spaces (with upgraded CPU or T4 GPU).
15
  """
16
  print("--- Loading main LLM: microsoft/Phi-3-mini-4k-instruct ---")
17
  model_name = "microsoft/phi-3-mini-4k-instruct"
@@ -21,17 +20,19 @@ def load_llm_pipeline():
21
  model = AutoModelForCausalLM.from_pretrained(
22
  model_name,
23
  device_map="auto",
24
- torch_dtype=torch.float32,
25
  trust_remote_code=True
26
  )
27
 
28
  # Build text generation pipeline
 
29
  llm_pipeline = pipeline(
30
  "text-generation",
31
  model=model,
32
  tokenizer=tokenizer,
33
  max_new_tokens=300,
34
- return_full_text=False
 
35
  )
36
 
37
  print("--- Phi-3-mini model loaded successfully ---")
@@ -42,7 +43,12 @@ def get_llm_response(prompt: str) -> str:
42
  Gets a response from the cached Phi-3-mini LLM pipeline.
43
  """
44
  llm_pipeline = load_llm_pipeline()
45
- formatted_prompt = f"<|user|>\n{prompt}\n<|assistant|>"
 
 
 
 
 
46
 
47
  print("AI: (Generating response with Phi-3-mini...)")
48
  try:
@@ -51,4 +57,4 @@ def get_llm_response(prompt: str) -> str:
51
  return response
52
  except Exception as e:
53
  print(f"Error during Phi-3-mini generation: {e}")
54
- return "Sorry, I encountered an error while generating a response."
 
11
  def load_llm_pipeline():
12
  """
13
  Loads and caches the local LLM pipeline using Phi-3-mini-4k-instruct.
 
14
  """
15
  print("--- Loading main LLM: microsoft/Phi-3-mini-4k-instruct ---")
16
  model_name = "microsoft/phi-3-mini-4k-instruct"
 
20
  model = AutoModelForCausalLM.from_pretrained(
21
  model_name,
22
  device_map="auto",
23
+ torch_dtype="auto", # Use "auto" for better compatibility
24
  trust_remote_code=True
25
  )
26
 
27
  # Build text generation pipeline
28
+ # CORRECTED: Added eos_token_id for cleaner, more reliable generation
29
  llm_pipeline = pipeline(
30
  "text-generation",
31
  model=model,
32
  tokenizer=tokenizer,
33
  max_new_tokens=300,
34
+ return_full_text=False,
35
+ eos_token_id=tokenizer.eos_token_id # Crucial for stopping generation
36
  )
37
 
38
  print("--- Phi-3-mini model loaded successfully ---")
 
43
  Gets a response from the cached Phi-3-mini LLM pipeline.
44
  """
45
  llm_pipeline = load_llm_pipeline()
46
+ # Phi-3 uses a specific chat template format
47
+ messages = [
48
+ {"role": "user", "content": prompt},
49
+ ]
50
+ # Use the tokenizer's built-in chat template for the most reliable formatting
51
+ formatted_prompt = llm_pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
52
 
53
  print("AI: (Generating response with Phi-3-mini...)")
54
  try:
 
57
  return response
58
  except Exception as e:
59
  print(f"Error during Phi-3-mini generation: {e}")
60
+ return "Sorry, I encountered an error while generating a response."