AION Protocol Development committed on
Commit
b883a41
·
1 Parent(s): ce725ca

feat: Add Qwen2.5-Coder-32B and Phi-4-mini via HF Inference API + limit to 32K tokens

Browse files

- Added TIER 5: FREE HUGGINGFACE MODELS
- Qwen2.5-Coder-32B-Instruct (32B code specialist)
- Phi-4-mini-instruct (Microsoft efficient model)
- Changed max_tokens from 64000 to 32000 (user request - fix 400 error)
- Updated context_window to 32000 in MODEL_CONFIGS
- Updated UI text: 64,000 → 32,000 tokens
- HuggingFace Inference API provider already implemented
- Models auto-appear in dropdown via list(MODEL_CONFIGS.keys())

Files changed (1) hide show
  1. app.py +27 -6
app.py CHANGED
@@ -21,7 +21,7 @@ MODEL_CONFIGS = {
21
  "model": "claude-sonnet-4-20250514",
22
  "api_key_env": "ANTHROPIC_API_KEY",
23
  "cost_per_1M_tokens": 3.00,
24
- "context_window": 64000,
25
  "tier": "premium",
26
  "description": "Best for complex architecture"
27
  },
@@ -73,6 +73,27 @@ MODEL_CONFIGS = {
73
  "context_window": 1000000,
74
  "tier": "free-google",
75
  "description": "Experimental - Ultra-fast generation (1M context)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  }
77
  }
78
 
@@ -94,7 +115,7 @@ OUTPUT FORMAT:
94
  3. Dockerfile (if deployment mentioned)
95
  4. Brief README with usage instructions
96
 
97
- Context window: 64,000 tokens output (demo limit) - you can generate comprehensive solutions.
98
 
99
  Be complete and thorough. Focus on quality and production-readiness."""
100
 
@@ -122,7 +143,7 @@ def generate_code_with_model(prompt: str, model_name: str, temperature: float =
122
  client = anthropic.Anthropic(api_key=os.getenv(config["api_key_env"]))
123
  response = client.messages.create(
124
  model=config["model"],
125
- max_tokens=64000, # Limited for demo stability
126
  temperature=temperature,
127
  system=SYSTEM_PROMPT,
128
  messages=[{"role": "user", "content": prompt}]
@@ -170,7 +191,7 @@ def generate_code_with_model(prompt: str, model_name: str, temperature: float =
170
  model = genai.GenerativeModel(config["model"])
171
  response = model.generate_content(
172
  f"{SYSTEM_PROMPT}\n\nUser request: {prompt}",
173
- generation_config={"temperature": temperature, "max_output_tokens": 64000} # Gemini 2.0 Flash supports up to 8K (65536 is max for SDK)
174
  )
175
  generated_code = response.text
176
  input_tokens = response.usage_metadata.prompt_token_count
@@ -338,7 +359,7 @@ with gr.Blocks(
338
 
339
  **Pure prompt evaluation:** Describe your requirements in detail. The AI will decide language, framework, and architecture based on your instructions.
340
 
341
- **Context Window:** 64,000 tokens output
342
  """)
343
 
344
  with gr.Row():
@@ -386,7 +407,7 @@ with gr.Blocks(
386
 
387
  **Pure prompt evaluation:** Each model reads the same instructions and decides implementation details independently.
388
 
389
- **Context Window:** 64,000 tokens output per model
390
  """)
391
 
392
  with gr.Row():
 
21
  "model": "claude-sonnet-4-20250514",
22
  "api_key_env": "ANTHROPIC_API_KEY",
23
  "cost_per_1M_tokens": 3.00,
24
+ "context_window": 32000,
25
  "tier": "premium",
26
  "description": "Best for complex architecture"
27
  },
 
73
  "context_window": 1000000,
74
  "tier": "free-google",
75
  "description": "Experimental - Ultra-fast generation (1M context)"
76
+ },
77
+
78
+
79
+ # === TIER 5: FREE HUGGINGFACE MODELS ===
80
+ "Qwen2.5-Coder-32B 🤗": {
81
+ "provider": "huggingface",
82
+ "model": "Qwen/Qwen2.5-Coder-32B-Instruct",
83
+ "api_key_env": "HF_TOKEN",
84
+ "cost_per_1M_tokens": 0.00,
85
+ "context_window": 32768,
86
+ "tier": "free-hf",
87
+ "description": "32B code specialist via HF Inference API (FREE)"
88
+ },
89
+ "Phi-4-mini 🤗": {
90
+ "provider": "huggingface",
91
+ "model": "microsoft/Phi-4-mini-instruct",
92
+ "api_key_env": "HF_TOKEN",
93
+ "cost_per_1M_tokens": 0.00,
94
+ "context_window": 16384,
95
+ "tier": "free-hf",
96
+ "description": "Microsoft's efficient code model via HF Inference API"
97
  }
98
  }
99
 
 
115
  3. Dockerfile (if deployment mentioned)
116
  4. Brief README with usage instructions
117
 
118
+ Context window: 32,000 tokens output (demo limit) - you can generate comprehensive solutions.
119
 
120
  Be complete and thorough. Focus on quality and production-readiness."""
121
 
 
143
  client = anthropic.Anthropic(api_key=os.getenv(config["api_key_env"]))
144
  response = client.messages.create(
145
  model=config["model"],
146
+ max_tokens=32000, # Limited for demo stability
147
  temperature=temperature,
148
  system=SYSTEM_PROMPT,
149
  messages=[{"role": "user", "content": prompt}]
 
191
  model = genai.GenerativeModel(config["model"])
192
  response = model.generate_content(
193
  f"{SYSTEM_PROMPT}\n\nUser request: {prompt}",
194
+ generation_config={"temperature": temperature, "max_output_tokens": 32000} # Gemini 2.0 Flash SDK allows up to 65536; capped at 32000 for demo stability
195
  )
196
  generated_code = response.text
197
  input_tokens = response.usage_metadata.prompt_token_count
 
359
 
360
  **Pure prompt evaluation:** Describe your requirements in detail. The AI will decide language, framework, and architecture based on your instructions.
361
 
362
+ **Context Window:** 32,000 tokens output
363
  """)
364
 
365
  with gr.Row():
 
407
 
408
  **Pure prompt evaluation:** Each model reads the same instructions and decides implementation details independently.
409
 
410
+ **Context Window:** 32,000 tokens output per model
411
  """)
412
 
413
  with gr.Row():