Samarth Naik committed
Commit 3fe8360
1 Parent(s): 86d60a9

Switch to coding-focused models and remove GPU requirements

Files changed (2)
  1. README.md +2 -2
  2. app.py +48 -29
README.md CHANGED
@@ -8,9 +8,9 @@ app_port: 5001
 pinned: false
 ---
 
-# AI Text Generation Flask API
+# AI Coding Assistant Flask API
 
-A Flask web application that serves AI text generation models via a REST API.
+A Flask web application that serves coding-focused AI models via a REST API. Optimized for code generation, debugging, and programming assistance.
 
 ## Features
 
 
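The README's REST API description can be illustrated with a small client call. The sketch below is illustrative only: the /generate endpoint path and the JSON field names are assumptions based on the parameters of generate_response() in app.py, not something this commit confirms; only the port (5001, from app_port in the README front matter) is given.

    # Hypothetical client call; the endpoint path and payload keys are assumed, not confirmed by this commit
    import requests

    resp = requests.post(
        "http://localhost:5001/generate",  # app_port: 5001 comes from the README front matter
        json={
            "prompt": "Write a Python function that reverses a string",
            "max_length": 256,       # mirrors generate_response()'s parameters
            "temperature": 0.7,
            "top_p": 0.9,
        },
        timeout=120,
    )
    print(resp.json())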
app.py CHANGED
@@ -17,18 +17,18 @@ model = None
 tokenizer = None
 
 def load_model():
-    """Load the Llama model and tokenizer"""
+    """Load the model and tokenizer"""
     global model, tokenizer
 
     try:
-        logger.info("Loading Microsoft DialoGPT model (Llama alternative)...")
-        # Using an ungated model that works similarly
-        model_name = "microsoft/DialoGPT-large"
+        logger.info("Loading Salesforce CodeT5+ model (good for coding tasks)...")
+        # Using CodeT5+ which is good for coding and doesn't require GPU
+        model_name = "Salesforce/codet5p-770m"
 
-        # Alternative ungated models you can try:
-        # model_name = "distilbert/distilgpt2"  # Smaller, faster
-        # model_name = "gpt2-large"  # GPT-2 Large
-        # model_name = "EleutherAI/gpt-neo-1.3B"  # GPT-Neo
+        # Alternative good coding models (ungated):
+        # model_name = "microsoft/CodeBERT-base"
+        # model_name = "EleutherAI/gpt-neo-1.3B"  # Better general model
+        # model_name = "microsoft/DialoGPT-medium"  # Better conversation
 
         # Load tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -37,36 +37,47 @@ def load_model():
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
 
-        # Load model with optimizations
+        # Load model without quantization (CPU compatible)
         model = AutoModelForCausalLM.from_pretrained(
             model_name,
-            torch_dtype=torch.float16,
-            device_map="auto",
-            load_in_8bit=True,  # Use 8-bit quantization to reduce memory usage
+            torch_dtype=torch.float32,  # Use float32 for CPU compatibility
+            device_map=None,  # Let PyTorch handle device placement
             trust_remote_code=True
         )
 
-        logger.info("Model loaded successfully!")
+        logger.info("CodeT5+ model loaded successfully!")
 
     except Exception as e:
-        logger.error(f"Error loading model: {str(e)}")
-        # Fallback to a simpler model
+        logger.error(f"Error loading CodeT5+: {str(e)}")
+        # Fallback to GPT-Neo (much better than GPT-2)
         try:
-            logger.info("Falling back to GPT-2...")
-            model_name = "gpt2"
+            logger.info("Falling back to GPT-Neo 1.3B (better than GPT-2)...")
+            model_name = "EleutherAI/gpt-neo-1.3B"
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token
 
             model = AutoModelForCausalLM.from_pretrained(
                 model_name,
-                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                device_map="auto" if torch.cuda.is_available() else None
+                torch_dtype=torch.float32,
+                device_map=None,
+                trust_remote_code=True
             )
-            logger.info("Fallback model loaded successfully!")
+            logger.info("GPT-Neo model loaded successfully!")
         except Exception as fallback_error:
-            logger.error(f"Fallback also failed: {str(fallback_error)}")
-            raise e
+            logger.error(f"GPT-Neo also failed, using GPT-2: {str(fallback_error)}")
+            # Final fallback to GPT-2
+            model_name = "gpt2"
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            if tokenizer.pad_token is None:
+                tokenizer.pad_token = tokenizer.eos_token
+
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype=torch.float32,
+                device_map=None
+            )
+            logger.info("GPT-2 fallback loaded successfully!")
 
 def generate_response(prompt, max_length=512, temperature=0.7, top_p=0.9):
     """Generate response using the loaded model"""
@@ -76,8 +87,13 @@ def generate_response(prompt, max_length=512, temperature=0.7, top_p=0.9):
         raise ValueError("Model not loaded. Please ensure the model is properly initialized.")
 
     try:
-        # Simple prompt formatting (works for most models)
-        formatted_prompt = f"User: {prompt}\nAssistant:"
+        # Enhanced prompt formatting for coding tasks
+        if any(keyword in prompt.lower() for keyword in ['code', 'python', 'function', 'class', 'def ', 'import', 'javascript', 'html', 'css']):
+            # Coding-specific prompt format
+            formatted_prompt = f"# Task: {prompt}\n# Solution:\n"
+        else:
+            # General conversation format
+            formatted_prompt = f"Human: {prompt}\n\nAssistant: "
 
         # Tokenize the input
         inputs = tokenizer.encode(formatted_prompt, return_tensors="pt")
@@ -85,7 +101,7 @@ def generate_response(prompt, max_length=512, temperature=0.7, top_p=0.9):
         # Move to the same device as the model
         inputs = inputs.to(model.device)
 
-        # Generate response
+        # Generate response with better parameters for code
         with torch.no_grad():
             outputs = model.generate(
                 inputs,
@@ -95,15 +111,18 @@ def generate_response(prompt, max_length=512, temperature=0.7, top_p=0.9):
                 do_sample=True,
                 pad_token_id=tokenizer.eos_token_id,
                 eos_token_id=tokenizer.eos_token_id,
-                repetition_penalty=1.1
+                repetition_penalty=1.1,
+                no_repeat_ngram_size=2  # Avoid repetitive code
            )
 
         # Decode the response
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-        # Extract only the assistant's response
-        if "Assistant:" in response:
-            response = response.split("Assistant:")[-1].strip()
+        # Extract only the generated response
+        if "# Solution:\n" in response:
+            response = response.split("# Solution:\n")[-1].strip()
+        elif "Assistant: " in response:
+            response = response.split("Assistant: ")[-1].strip()
         elif formatted_prompt in response:
             response = response.replace(formatted_prompt, "").strip()
 
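Two notes on the new app.py logic. First, the keyword check in generate_response() simply routes between two plain-text templates: a prompt containing words such as "python" or "function" is wrapped as "# Task: ...\n# Solution:\n", while anything else becomes "Human: ...\n\nAssistant: ", which is why the decoding step splits on "# Solution:\n" or "Assistant: " to isolate the generated part. Second, Salesforce/codet5p-770m is a T5-style encoder-decoder checkpoint, so the AutoModelForCausalLM call above may fail to load it and drop through to the GPT-Neo fallback; a seq2seq loading path would look roughly like the sketch below (a minimal CPU-only sketch under that assumption, not part of this commit).

    # Minimal sketch: loading CodeT5+ as a seq2seq model (assumption; not what this commit does)
    import torch
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    model_name = "Salesforce/codet5p-770m"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # float32 keeps the model CPU-compatible, matching this commit
    )

    prompt = "# Task: write a function that reverses a string\n# Solution:\n"
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))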