hamxaameer committed
Commit 25c4058 · verified · 1 Parent(s): 3c74f4e

Update app.py

Files changed (1)
app.py +138 -74
app.py CHANGED
@@ -36,18 +36,12 @@ CONFIG = {
     "max_tokens": 350,
 }
 
-# Remote inference config (optional). If `HF_INFERENCE_API_KEY` is set in the
-# environment, the app will prefer calling the Hugging Face Inference API (remote
-# hosted model) which can generate longer outputs faster than a CPU-bound local
-# model. Set `HF_INFERENCE_MODEL` to choose the remote model (instruction-tuned
-# model recommended).
-#
-# PHI models are excellent lightweight instruction-following models:
-# - microsoft/phi-2 (2.7B parameters, free inference)
-# - microsoft/Phi-3-mini-4k-instruct (3.8B parameters, recommended)
-# - microsoft/Phi-3-mini-128k-instruct (3.8B with longer context)
+# Local PHI model configuration for Hugging Face Spaces
+# PHI-2 is optimal for CPU deployment: 2.7B parameters, excellent quality
+# Can be swapped with Phi-3-mini-4k-instruct if more memory is available
+LOCAL_PHI_MODEL = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
+USE_8BIT_QUANTIZATION = True  # Reduces memory usage by ~50%
 USE_REMOTE_LLM = False
-REMOTE_LLM_MODEL = os.environ.get("HF_INFERENCE_MODEL", "microsoft/Phi-3-mini-4k-instruct")
 
 # Prefer the environment variable, but also allow a local token file for users
 # who don't know how to set env vars. Create a file named `hf_token.txt` in the
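Note: `LOCAL_PHI_MODEL` is read from the environment with a phi-2 default, so the model can be swapped per Space without editing app.py. A minimal sketch of that override (illustrative only; the variable name comes from the hunk above):

    import os

    # When unset, the lookup falls back to the default wired into the commit.
    os.environ["LOCAL_PHI_MODEL"] = "microsoft/Phi-3-mini-4k-instruct"
    model_id = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
    print(model_id)  # microsoft/Phi-3-mini-4k-instruct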
@@ -71,44 +65,92 @@ if HF_INFERENCE_API_KEY:
 # ============================================================================
 
 def initialize_llm():
-    # If a remote HF Inference API key is provided, we won't instantiate a local
-    # heavy model; instead generation will be performed via the HTTP API.
-    global USE_REMOTE_LLM, REMOTE_LLM_MODEL
-    # For Hugging Face Spaces deployment: prefer remote PHI inference
-    # This avoids memory issues on CPU-only spaces and provides better performance
-    if USE_REMOTE_LLM:
-        logger.info(f"🔄 Using remote Hugging Face Inference with PHI model: {REMOTE_LLM_MODEL}")
-        logger.info(f" ✅ PHI models are optimized for instruction-following and long-form generation")
-        CONFIG["llm_model"] = REMOTE_LLM_MODEL
-        CONFIG["model_type"] = "remote_phi"
-        return None
-
-    # Final fallback: attempt to initialize the free local T5 model (as before)
-    logger.info("🔄 Initializing FREE local language model (fallback to T5)...")
-    model_name = "google/flan-t5-large"
-
+    """Initialize PHI model locally with CPU optimizations for Hugging Face Spaces.
+
+    Uses efficient techniques:
+    - 8-bit quantization to reduce memory by ~50%
+    - CPU-optimized loading with device_map
+    - Lazy loading and minimal memory footprint
+    """
+    global LOCAL_PHI_MODEL, USE_8BIT_QUANTIZATION
+
+    logger.info(f"🔄 Initializing local PHI model: {LOCAL_PHI_MODEL}")
+    logger.info(" Using CPU-optimized configuration for Hugging Face Spaces")
+
     try:
-        logger.info(f" Loading {model_name}...")
-        device = 0 if torch.cuda.is_available() else -1
-
-        model_kwargs = {"low_cpu_mem_usage": True}
-
+        from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        # Check if we have GPU (unlikely on free Spaces, but check anyway)
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f" Target device: {device}")
+
+        # Load tokenizer (lightweight)
+        logger.info(" Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(
+            LOCAL_PHI_MODEL,
+            trust_remote_code=True,
+            use_fast=True
+        )
+
+        # Set padding token if not present (PHI models need this)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        # Configure model loading for CPU efficiency
+        model_kwargs = {
+            "trust_remote_code": True,
+            "low_cpu_mem_usage": True,
+            "torch_dtype": torch.float32,  # CPU works best with float32
+        }
+
+        # Try to use 8-bit quantization if available (requires bitsandbytes)
+        if USE_8BIT_QUANTIZATION and device == "cpu":
+            try:
+                logger.info(" Attempting 8-bit quantization for memory efficiency...")
+                model_kwargs["load_in_8bit"] = True
+            except Exception as quant_error:
+                logger.warning(f" 8-bit quantization unavailable: {quant_error}")
+                logger.info(" Falling back to float32 (will use more memory)")
+
+        # Load the model
+        logger.info(" Loading PHI model (this may take 30-60 seconds)...")
+        model = AutoModelForCausalLM.from_pretrained(
+            LOCAL_PHI_MODEL,
+            **model_kwargs
+        )
+
+        # Move to eval mode to disable dropout and save memory
+        model.eval()
+
+        # Create pipeline for generation
+        logger.info(" Creating text-generation pipeline...")
         llm_client = pipeline(
-            "text2text-generation",
-            model=model_name,
-            device=device,
-            model_kwargs=model_kwargs
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device=0 if device == "cuda" else -1,
+            max_new_tokens=512,
+            pad_token_id=tokenizer.eos_token_id
         )
-
-        CONFIG["llm_model"] = model_name
-        CONFIG["model_type"] = "t5"
-        logger.info(f"✅ LLM initialized: {model_name}")
-        logger.info(f" Device: {'GPU' if device == 0 else 'CPU'}")
+
+        CONFIG["llm_model"] = LOCAL_PHI_MODEL
+        CONFIG["model_type"] = "phi_local"
+
+        logger.info(f"✅ PHI model initialized successfully: {LOCAL_PHI_MODEL}")
+        logger.info(f" Model size: ~2.7B parameters (PHI-2) or ~3.8B (PHI-3)")
+        logger.info(f" Memory optimization: {'8-bit quantization' if USE_8BIT_QUANTIZATION else 'float32'}")
+
         return llm_client
-
+
+    except ImportError as ie:
+        logger.error(f"❌ Missing required library: {ie}")
+        logger.info(" Install with: pip install transformers accelerate bitsandbytes")
+        raise
     except Exception as e:
-        logger.error(f"❌ Failed to load model: {str(e)}")
-        raise Exception(f"Failed to initialize LLM: {str(e)}")
+        logger.error(f"❌ Failed to load PHI model: {str(e)}")
+        logger.info(" This may be due to insufficient memory on the Space")
+        logger.info(" Try using a smaller model or enabling 8-bit quantization")
+        raise Exception(f"Failed to initialize PHI LLM: {str(e)}")
 
 
 def remote_generate(prompt: str, max_new_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.9) -> str:
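For context, a `text-generation` pipeline like the one built in this hunk returns a list of dicts whose `generated_text` field holds the prompt followed by the completion. A minimal, self-contained sketch of calling such a pipeline (model name, prompt, and parameter values are illustrative, not taken from app.py):

    from transformers import pipeline

    generator = pipeline("text-generation", model="microsoft/phi-2", trust_remote_code=True)
    out = generator(
        "Instruct: Explain retrieval-augmented generation in one sentence.\nOutput:",
        max_new_tokens=64,
        do_sample=True,
        temperature=0.7,
        pad_token_id=generator.tokenizer.eos_token_id,
    )
    print(out[0]["generated_text"])  # prompt + completion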
@@ -472,14 +514,30 @@ Draft:
 Answer:
 """
 
-    logger.info(" → Polishing scaffold with LLM")
+    logger.info(" → Polishing scaffold with PHI model")
     try:
-        if USE_REMOTE_LLM:
-            polished = remote_generate(polish_prompt, max_new_tokens=600, temperature=0.72, top_p=0.92)
+        out = llm_client(
+            polish_prompt,
+            max_new_tokens=600,
+            temperature=0.72,
+            top_p=0.92,
+            do_sample=True,
+            repetition_penalty=1.1,
+            pad_token_id=llm_client.tokenizer.eos_token_id
+        )
+
+        # Extract and clean the polished text
+        if isinstance(out, list) and out:
+            polished = out[0].get('generated_text', '') if isinstance(out[0], dict) else str(out[0])
+        else:
+            polished = str(out)
+
+        # Remove prompt echo if present
+        if polish_prompt in polished:
+            polished = polished[len(polish_prompt):].strip()
         else:
-            out = llm_client(polish_prompt, max_new_tokens=600, temperature=0.72, top_p=0.92, do_sample=True, num_beams=1)
-            polished = out[0].get('generated_text', '') if isinstance(out, list) and out else str(out)
             polished = polished.strip()
+
     except Exception as e:
         logger.error(f" ✗ Polishing error: {e}")
         return None
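The prompt-echo removal above is needed because a causal text-generation pipeline returns the prompt followed by the completion, so the new text has to be sliced off. The same idiom in isolation (hypothetical helper, not defined in app.py):

    def strip_prompt_echo(prompt: str, generated: str) -> str:
        # Keep only the completion when the model echoes the prompt back.
        if generated.startswith(prompt):
            return generated[len(prompt):].strip()
        return generated.strip()

    prompt = "Draft: short summary\nAnswer:"
    output = prompt + " The polished answer text."
    print(strip_prompt_echo(prompt, output))  # "The polished answer text."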
@@ -555,9 +613,9 @@ def generate_llm_answer(
     llm_client,
     attempt: int = 1
 ) -> Optional[str]:
-    # Allow operation when using remote inference (no local llm_client).
-    if not llm_client and not USE_REMOTE_LLM:
-        logger.error(" → LLM client not initialized and remote inference disabled")
+    # Ensure we have a local PHI model loaded
+    if not llm_client:
+        logger.error(" → PHI model not initialized")
         return None
 
     query_lower = query.lower()
@@ -600,27 +658,36 @@
     max_iterations = 4
 
     def call_model(prompt, max_new_tokens, temperature, top_p, repetition_penalty):
-        logger.info(f" → Model call (temp={temperature}, max_new_tokens={max_new_tokens})")
+        logger.info(f" → PHI model call (temp={temperature}, max_new_tokens={max_new_tokens})")
         try:
-            if USE_REMOTE_LLM:
-                # Use remote Hugging Face Inference API
-                return remote_generate(prompt, max_new_tokens, temperature, top_p)
-
+            # Call local PHI model (causal LM)
             out = llm_client(
                 prompt,
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
                 top_p=top_p,
                 do_sample=True,
-                num_beams=1,
                 repetition_penalty=repetition_penalty,
-                early_stopping=False
+                num_return_sequences=1,
+                pad_token_id=llm_client.tokenizer.eos_token_id,
+                eos_token_id=llm_client.tokenizer.eos_token_id
             )
+
+            # Extract generated text from pipeline output
             if isinstance(out, list) and out:
-                return out[0].get('generated_text', '') if isinstance(out[0], dict) else str(out[0])
-            return str(out)
+                generated = out[0].get('generated_text', '') if isinstance(out[0], dict) else str(out[0])
+            else:
+                generated = str(out)
+
+            # PHI models return prompt + completion, extract only new text
+            if prompt in generated:
+                # Remove the prompt from the output
+                generated = generated[len(prompt):].strip()
+
+            return generated
+
         except Exception as e:
-            logger.error(f" ✗ Model call error: {e}")
+            logger.error(f" ✗ PHI model call error: {e}")
             return ''
 
     # Build initial prompt
@@ -771,18 +838,15 @@ def generate_answer_langchain(
 
     if not llm_answer:
         logger.error(f" ✗ All 2 LLM attempts failed")
-        # Next attempt: if remote LLM is available, build a short scaffold from
-        # retrieved documents and ask the remote model to polish/expand it. This
-        # is more reliable than single-shot long generation on some models.
-        if USE_REMOTE_LLM:
-            try:
-                logger.info(" → Attempting scaffold-and-polish using remote LLM")
-                polished = scaffold_and_polish(query, retrieved_docs, llm_client)
-                if polished:
-                    logger.info(" ✅ Scaffold-and-polish produced an answer")
-                    return polished
-            except Exception as e:
-                logger.error(f" ✗ Scaffold-and-polish error: {e}")
+        # Try scaffold-and-polish as a fallback strategy
+        try:
+            logger.info(" → Attempting scaffold-and-polish using PHI model")
+            polished = scaffold_and_polish(query, retrieved_docs, llm_client)
+            if polished:
+                logger.info(" ✅ Scaffold-and-polish produced an answer")
+                return polished
+        except Exception as e:
+            logger.error(f" ✗ Scaffold-and-polish error: {e}")
 
     # Final fallback: extractive templated answer (guaranteed deterministic)
     try:
 