Spaces:

soxogvv
/

codellama

Runtime error

App Files Files Community

soxogvv committed on Aug 18, 2025

Commit

605e76c

verified ·

1 Parent(s): 02e2ce3

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -33

app.py CHANGED Viewed

@@ -41,30 +41,56 @@ class CodeLlamaService:
                 # Use the smallest Code Llama model that fits in 16GB
                 model_name = "codellama/CodeLlama-7b-Instruct-hf"
-                # Load with memory optimization
                 self.tokenizer = AutoTokenizer.from_pretrained(
                     model_name,
                     use_fast=True,
                     trust_remote_code=True
                 )
-                # Load model with optimizations for CPU inference
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    torch_dtype=torch.float16,
-                    low_cpu_mem_usage=True,
-                    trust_remote_code=True,
-                    device_map="auto"
-                )
-                # Create pipeline
-                self.pipeline = pipeline(
-                    "text-generation",
-                    model=self.model,
-                    tokenizer=self.tokenizer,
-                    torch_dtype=torch.float16,
-                    device_map="auto"
-                )
                 self.is_loaded = True
                 logger.info("Model loaded successfully!")
@@ -72,6 +98,16 @@ class CodeLlamaService:
             except Exception as e:
                 logger.error(f"Error loading model: {str(e)}")
                 self.is_loaded = False
             finally:
                 self.is_loading = False
@@ -84,21 +120,32 @@ class CodeLlamaService:
             # Format prompt for instruction following
             formatted_prompt = f"<s>[INST] {prompt} [/INST]"
-            # Generate response
-            outputs = self.pipeline(
-                formatted_prompt,
-                max_length=max_length,
-                temperature=temperature,
-                do_sample=True,
-                top_p=0.9,
-                repetition_penalty=1.1,
-                pad_token_id=self.tokenizer.eos_token_id,
-                eos_token_id=self.tokenizer.eos_token_id
-            )
             # Extract generated text
-            generated_text = outputs[0]['generated_text']
-            response = generated_text[len(formatted_prompt):].strip()
             # Split response into code and explanation if possible
             code, explanation = self._parse_response(response)
@@ -142,12 +189,27 @@ class CodeLlamaService:
             code_lines = []
             explanation_lines = []
             for line in lines:
-                if (line.strip().startswith(('def ', 'class ', 'import ', 'from ', 'if ', 'for ', 'while ', '    ', '\t')) or
-                    '=' in line and not line.strip().startswith('#')):
                     code_lines.append(line)
                 else:
-                    explanation_lines.append(line)
             code = '\n'.join(code_lines)
             explanation = '\n'.join(explanation_lines)

                 # Use the smallest Code Llama model that fits in 16GB
                 model_name = "codellama/CodeLlama-7b-Instruct-hf"
+                # Check if CUDA is available
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+                logger.info(f"Using device: {device}")
+                # Load tokenizer
                 self.tokenizer = AutoTokenizer.from_pretrained(
                     model_name,
                     use_fast=True,
                     trust_remote_code=True
                 )
+                # Configure model loading based on device
+                if device == "cuda":
+                    # GPU: Use float16 for memory efficiency
+                    self.model = AutoModelForCausalLM.from_pretrained(
+                        model_name,
+                        torch_dtype=torch.float16,
+                        low_cpu_mem_usage=True,
+                        trust_remote_code=True,
+                        device_map="auto"
+                    )
+                    torch_dtype = torch.float16
+                else:
+                    # CPU: Use float32 to avoid Half precision errors
+                    self.model = AutoModelForCausalLM.from_pretrained(
+                        model_name,
+                        torch_dtype=torch.float32,
+                        low_cpu_mem_usage=True,
+                        trust_remote_code=True
+                    )
+                    # Move model to CPU explicitly
+                    self.model = self.model.to('cpu')
+                    torch_dtype = torch.float32
+                # Create pipeline with appropriate settings
+                if device == "cuda":
+                    self.pipeline = pipeline(
+                        "text-generation",
+                        model=self.model,
+                        tokenizer=self.tokenizer,
+                        torch_dtype=torch_dtype,
+                        device=0  # GPU device
+                    )
+                else:
+                    self.pipeline = pipeline(
+                        "text-generation",
+                        model=self.model,
+                        tokenizer=self.tokenizer,
+                        device=-1  # CPU device
+                    )
                 self.is_loaded = True
                 logger.info("Model loaded successfully!")
             except Exception as e:
                 logger.error(f"Error loading model: {str(e)}")
                 self.is_loaded = False
+                # Clean up on failure
+                if hasattr(self, 'model') and self.model is not None:
+                    del self.model
+                if hasattr(self, 'tokenizer') and self.tokenizer is not None:
+                    del self.tokenizer
+                if hasattr(self, 'pipeline') and self.pipeline is not None:
+                    del self.pipeline
+                gc.collect()
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
             finally:
                 self.is_loading = False
             # Format prompt for instruction following
             formatted_prompt = f"<s>[INST] {prompt} [/INST]"
+            # Generate response with error handling
+            generation_kwargs = {
+                "max_new_tokens": max_length,
+                "do_sample": True if temperature > 0 else False,
+                "temperature": temperature if temperature > 0 else None,
+                "top_p": 0.9 if temperature > 0 else None,
+                "repetition_penalty": 1.1,
+                "return_full_text": False,
+                "pad_token_id": self.tokenizer.eos_token_id
+            }
+            # Remove None values to avoid warnings
+            generation_kwargs = {k: v for k, v in generation_kwargs.items() if v is not None}
+            outputs = self.pipeline(formatted_prompt, **generation_kwargs)
             # Extract generated text
+            if isinstance(outputs, list) and len(outputs) > 0:
+                if 'generated_text' in outputs[0]:
+                    response = outputs[0]['generated_text']
+                else:
+                    response = str(outputs[0])
+            else:
+                response = str(outputs)
+            response = response.strip()
             # Split response into code and explanation if possible
             code, explanation = self._parse_response(response)
             code_lines = []
             explanation_lines = []
+            in_code_block = False
             for line in lines:
+                # Simple heuristic to detect code vs explanation
+                if (line.strip().startswith(('def ', 'class ', 'import ', 'from ', 'if ', 'for ', 'while ', 'function', 'var ', 'let ', 'const ')) or
+                    line.startswith(('    ', '\t')) or
+                    ('=' in line and not line.strip().startswith('#') and not line.strip().startswith('//'))):
                     code_lines.append(line)
+                    in_code_block = True
+                elif in_code_block and line.strip() == '':
+                    code_lines.append(line)  # Keep empty lines in code blocks
                 else:
+                    if in_code_block and line.strip():
+                        # Check if this line looks like code or explanation
+                        if any(char in line for char in ['{', '}', ';', '()', '[]']) and not line.strip().endswith('.'):
+                            code_lines.append(line)
+                        else:
+                            explanation_lines.append(line)
+                            in_code_block = False
+                    else:
+                        explanation_lines.append(line)
+                        in_code_block = False
             code = '\n'.join(code_lines)
             explanation = '\n'.join(explanation_lines)