megharudushi committed on
Commit 1e22395 · verified · 1 Parent(s): 0768f31

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +202 -150
app.py CHANGED
@@ -6,12 +6,16 @@ Author: Matrix Agent
 Features:
 - Full OpenAI API compatibility (/v1/chat/completions)
 - Full Anthropic API compatibility (/v1/messages)
+- Prefill Response Support (assistant message prefix for output control)
+- Thinking/Reasoning Content Block Support
 - Optimized for coding tasks
 - Runs on free HF Spaces (2 vCPU, 16GB RAM)

 API Specifications verified against:
 - OpenAI: https://platform.openai.com/docs/api-reference/chat/create
 - Anthropic: https://docs.anthropic.com/en/api/messages
+- Prefill: https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prefill-claudes-response
+- MiniMax Anthropic: https://platform.minimax.io/docs/api-reference/text-anthropic-api
 """

 import os
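For context on the new prefill feature announced above: a minimal client sketch against the Space's Anthropic-style endpoint could look like this. The base URL, API key, and model alias are illustrative placeholders (not part of this commit), and `requests` is assumed to be installed.

```python
import requests

# Hypothetical call: the trailing assistant message becomes the prefill,
# steering the model to continue from "{" and emit JSON.
resp = requests.post(
    "http://localhost:7860/v1/messages",  # placeholder URL for the running Space
    headers={"x-api-key": "test", "anthropic-version": "2023-06-01"},
    json={
        "model": "qwen-coder",  # illustrative alias
        "max_tokens": 256,
        "messages": [
            {"role": "user", "content": "Return three Python list methods as a JSON array."},
            {"role": "assistant", "content": "{"},  # prefill prefix
        ],
    },
)
print(resp.json()["content"][-1]["text"])
```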
@@ -36,10 +40,10 @@ from pydantic import BaseModel, Field
 # ============================================================================

 MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-Coder-1.5B-Instruct")
-ANTHROPIC_VERSION = "2023-06-01"  # Standard Anthropic API version
+ANTHROPIC_VERSION = "2023-06-01"

 MODEL_ALIASES = {
-    # OpenAI-style model names -> actual model
+    # OpenAI-style model names
     "gpt-4": MODEL_ID,
     "gpt-4-turbo": MODEL_ID,
     "gpt-4o": MODEL_ID,
@@ -89,7 +93,6 @@ def load_model():
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token

-    # Load with CPU optimizations for 16GB RAM
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
         torch_dtype=torch.float32,
@@ -103,52 +106,45 @@ def load_model():
     return model, tokenizer

 # ============================================================================
-# Pydantic Models - OpenAI Compatible (Full Spec)
+# Pydantic Models - OpenAI Compatible
 # ============================================================================

 class OpenAIContentPart(BaseModel):
-    """Content part for multimodal messages"""
-    type: str  # "text", "image_url"
+    type: str
     text: Optional[str] = None
     image_url: Optional[Dict[str, str]] = None

 class OpenAIMessage(BaseModel):
-    """OpenAI message format - supports both string and array content"""
-    role: str  # "system", "user", "assistant", "tool"
+    role: str
     content: Optional[Union[str, List[OpenAIContentPart]]] = None
     name: Optional[str] = None
     tool_calls: Optional[List[Dict]] = None
    tool_call_id: Optional[str] = None

 class OpenAIResponseFormat(BaseModel):
-    """Response format specification"""
-    type: str = "text"  # "text", "json_object", "json_schema"
+    type: str = "text"
     json_schema: Optional[Dict] = None

 class OpenAIChatRequest(BaseModel):
-    """Full OpenAI Chat Completions request spec"""
     model: str
     messages: List[OpenAIMessage]
-    # Generation parameters
     temperature: Optional[float] = Field(default=1.0, ge=0, le=2)
     top_p: Optional[float] = Field(default=1.0, ge=0, le=1)
     n: Optional[int] = Field(default=1, ge=1, le=10)
     stream: Optional[bool] = False
     stop: Optional[Union[str, List[str]]] = None
     max_tokens: Optional[int] = None
-    max_completion_tokens: Optional[int] = None  # Newer parameter
+    max_completion_tokens: Optional[int] = None
     presence_penalty: Optional[float] = Field(default=0, ge=-2, le=2)
     frequency_penalty: Optional[float] = Field(default=0, ge=-2, le=2)
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: Optional[bool] = False
     top_logprobs: Optional[int] = None
-    # Additional parameters
     user: Optional[str] = None
     seed: Optional[int] = None
     tools: Optional[List[Dict]] = None
     tool_choice: Optional[Union[str, Dict]] = None
     response_format: Optional[OpenAIResponseFormat] = None
-    # Stream options
     stream_options: Optional[Dict] = None

 class OpenAIChoiceMessage(BaseModel):
@@ -159,7 +155,7 @@ class OpenAIChoiceMessage(BaseModel):
 class OpenAIChoice(BaseModel):
     index: int
     message: OpenAIChoiceMessage
-    finish_reason: Optional[str] = None  # "stop", "length", "tool_calls", "content_filter"
+    finish_reason: Optional[str] = None
     logprobs: Optional[Dict] = None

 class OpenAIStreamChoice(BaseModel):
@@ -176,7 +172,6 @@ class OpenAIUsage(BaseModel):
     completion_tokens_details: Optional[Dict] = None

 class OpenAIChatResponse(BaseModel):
-    """Full OpenAI Chat Completions response spec"""
     id: str
     object: str = "chat.completion"
     created: int
@@ -186,14 +181,6 @@ class OpenAIChatResponse(BaseModel):
     system_fingerprint: Optional[str] = None
     service_tier: Optional[str] = None

-class OpenAIStreamResponse(BaseModel):
-    id: str
-    object: str = "chat.completion.chunk"
-    created: int
-    model: str
-    choices: List[OpenAIStreamChoice]
-    system_fingerprint: Optional[str] = None
-
 class OpenAIModelInfo(BaseModel):
     id: str
     object: str = "model"
@@ -205,62 +192,52 @@ class OpenAIModelsResponse(BaseModel):
     data: List[OpenAIModelInfo]

 # ============================================================================
-# Pydantic Models - Anthropic Compatible (Full Spec)
+# Pydantic Models - Anthropic Compatible (with Thinking & Prefill support)
 # ============================================================================

 class AnthropicTextBlock(BaseModel):
-    """Text content block"""
     type: str = "text"
     text: str

 class AnthropicImageSource(BaseModel):
-    """Image source for vision"""
     type: str = "base64"
-    media_type: str  # "image/jpeg", "image/png", "image/webp", "image/gif"
+    media_type: str
     data: str

 class AnthropicImageBlock(BaseModel):
-    """Image content block"""
     type: str = "image"
     source: AnthropicImageSource

-class AnthropicToolUseBlock(BaseModel):
-    """Tool use content block"""
-    type: str = "tool_use"
-    id: str
-    name: str
-    input: Dict
-
-class AnthropicToolResultBlock(BaseModel):
-    """Tool result content block"""
-    type: str = "tool_result"
-    tool_use_id: str
-    content: Union[str, List[Dict]]
+class AnthropicThinkingBlock(BaseModel):
+    """Thinking/reasoning content block"""
+    type: str = "thinking"
+    thinking: str

-# Union type for all content blocks
-AnthropicContentBlock = Union[AnthropicTextBlock, AnthropicImageBlock, Dict]
+AnthropicContentBlock = Union[AnthropicTextBlock, AnthropicImageBlock, AnthropicThinkingBlock, Dict]

 class AnthropicMessage(BaseModel):
-    """Anthropic message format"""
     role: str  # "user", "assistant"
     content: Union[str, List[AnthropicContentBlock]]

 class AnthropicTool(BaseModel):
-    """Tool definition"""
     name: str
     description: Optional[str] = None
     input_schema: Dict

 class AnthropicToolChoice(BaseModel):
-    """Tool choice specification"""
-    type: str  # "auto", "any", "tool"
+    type: str
     name: Optional[str] = None

+class AnthropicThinkingConfig(BaseModel):
+    """Configuration for thinking/reasoning mode"""
+    type: str = "enabled"  # "enabled" or "disabled"
+    budget_tokens: Optional[int] = None  # Token budget for thinking
+
 class AnthropicRequest(BaseModel):
-    """Full Anthropic Messages API request spec"""
+    """Full Anthropic Messages API request with thinking & prefill support"""
     model: str
     messages: List[AnthropicMessage]
-    max_tokens: int  # Required in Anthropic API
+    max_tokens: int
     # Optional parameters
     system: Optional[Union[str, List[Dict]]] = None
     temperature: Optional[float] = Field(default=1.0, ge=0, le=1)
@@ -271,12 +248,16 @@ class AnthropicRequest(BaseModel):
     # Tool use
     tools: Optional[List[AnthropicTool]] = None
     tool_choice: Optional[AnthropicToolChoice] = None
+    # Thinking/reasoning support
+    thinking: Optional[AnthropicThinkingConfig] = None
     # Metadata
     metadata: Optional[Dict] = None

 class AnthropicResponseContent(BaseModel):
     type: str = "text"
     text: Optional[str] = None
+    # For thinking blocks
+    thinking: Optional[str] = None
     # For tool_use
     id: Optional[str] = None
     name: Optional[str] = None
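A request payload exercising the new thinking fields defined above might look like this sketch; the model alias and budget value are illustrative, not taken from this commit.

```python
# Sketch: a payload matching the new AnthropicRequest/AnthropicThinkingConfig
# models above (values are illustrative).
payload = {
    "model": "qwen-coder",
    "max_tokens": 512,
    "thinking": {"type": "enabled", "budget_tokens": 256},
    "messages": [{"role": "user", "content": "Why does 0.1 + 0.2 != 0.3 in floats?"}],
}
# Inside app.py this should parse cleanly, e.g.:
# AnthropicRequest(**payload).thinking.budget_tokens == 256
```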
@@ -287,13 +268,12 @@ class AnthropicUsage(BaseModel):
     output_tokens: int

 class AnthropicResponse(BaseModel):
-    """Full Anthropic Messages API response spec"""
     id: str
     type: str = "message"
     role: str = "assistant"
     model: str
     content: List[AnthropicResponseContent]
-    stop_reason: Optional[str] = None  # "end_turn", "max_tokens", "stop_sequence", "tool_use"
+    stop_reason: Optional[str] = None
     stop_sequence: Optional[str] = None
     usage: AnthropicUsage
@@ -302,7 +282,6 @@ class AnthropicResponse(BaseModel):
 # ============================================================================

 def extract_text_from_openai_content(content: Union[str, List, None]) -> str:
-    """Extract text from OpenAI message content (string or array)"""
     if content is None:
         return ""
     if isinstance(content, str):
@@ -319,7 +298,6 @@ def extract_text_from_openai_content(content: Union[str, List, None]) -> str:
     return str(content)

 def extract_text_from_anthropic_content(content: Union[str, List]) -> str:
-    """Extract text from Anthropic message content (string or array)"""
     if isinstance(content, str):
         return content
     if isinstance(content, list):
@@ -328,19 +306,20 @@ def extract_text_from_anthropic_content(content: Union[str, List]) -> str:
             if isinstance(block, dict):
                 if block.get("type") == "text":
                     text_parts.append(block.get("text", ""))
-            elif hasattr(block, "type") and block.type == "text":
-                text_parts.append(block.text or "")
+                elif block.get("type") == "thinking":
+                    pass  # Skip thinking blocks in extraction
+            elif hasattr(block, "type"):
+                if block.type == "text":
+                    text_parts.append(block.text or "")
         return "\n".join(text_parts)
     return str(content)

 def extract_system_prompt_anthropic(system: Union[str, List[Dict], None]) -> str:
-    """Extract system prompt from Anthropic format"""
     if system is None:
         return ""
     if isinstance(system, str):
         return system
     if isinstance(system, list):
-        # System can be array of text blocks
         text_parts = []
         for block in system:
             if isinstance(block, dict) and block.get("type") == "text":
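The effect of the new branches, as a small sketch (assumes the `extract_text_from_anthropic_content` helper above is in scope):

```python
# Sketch: text blocks are joined with newlines; thinking blocks are skipped.
blocks = [
    {"type": "thinking", "thinking": "internal reasoning, not user-visible"},
    {"type": "text", "text": "Hello"},
    {"type": "text", "text": "world"},
]
# extract_text_from_anthropic_content(blocks) -> "Hello\nworld"
```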
@@ -348,15 +327,40 @@ def extract_system_prompt_anthropic(system: Union[str, List[Dict], None]) -> str
         return "\n".join(text_parts)
     return ""

+def extract_prefill_from_messages(messages: List[Dict]) -> tuple[List[Dict], str]:
+    """
+    Extract prefill content if the last message is from the assistant.
+    Returns (messages_without_prefill, prefill_text)
+
+    Prefill allows controlling output by providing an initial assistant response.
+    See: https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/prefill-claudes-response
+    """
+    if not messages:
+        return messages, ""
+
+    last_msg = messages[-1]
+    if last_msg.get("role") == "assistant":
+        prefill = last_msg.get("content", "")
+        # Prefill cannot end with trailing whitespace
+        if isinstance(prefill, str):
+            prefill = prefill.rstrip()
+        return messages[:-1], prefill
+
+    return messages, ""
+
 # ============================================================================
-# Message Formatting
+# Message Formatting with Prefill Support
 # ============================================================================

 def format_messages_for_model(
     messages: List[Dict],
-    system_prompt: Optional[str] = None
+    system_prompt: Optional[str] = None,
+    prefill: str = ""
 ) -> str:
-    """Format messages for the model using chat template"""
+    """
+    Format messages for the model using chat template.
+    Supports prefill for controlling output format.
+    """
     formatted_messages = []

     if system_prompt:
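The contract of the new helper, sketched with a hypothetical message list:

```python
# Sketch of extract_prefill_from_messages (defined above):
msgs = [
    {"role": "user", "content": "Give me a JSON object."},
    {"role": "assistant", "content": "{\n  "},
]
# remaining, prefill = extract_prefill_from_messages(msgs)
# remaining -> [{"role": "user", "content": "Give me a JSON object."}]
# prefill   -> "{"   (trailing whitespace stripped, per the Anthropic rule)
```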
@@ -366,7 +370,6 @@ def format_messages_for_model(
         role = msg.get("role", "user")
         content = msg.get("content", "")

-        # Map tool role to assistant for compatibility
         if role == "tool":
             role = "user"

@@ -375,15 +378,19 @@ def format_messages_for_model(
     # Use tokenizer's chat template if available
     if hasattr(tokenizer, 'apply_chat_template') and tokenizer.chat_template:
         try:
-            return tokenizer.apply_chat_template(
+            prompt = tokenizer.apply_chat_template(
                 formatted_messages,
                 tokenize=False,
                 add_generation_prompt=True
             )
+            # Append prefill if provided
+            if prefill:
+                prompt = prompt + prefill
+            return prompt
         except Exception:
             pass

-    # Fallback: Simple format
+    # Fallback format
     prompt = ""
     for msg in formatted_messages:
         role = msg["role"]
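The template-plus-prefill pattern above can be sketched standalone as follows; this assumes `transformers` is installed and downloads the Qwen tokenizer named in this commit.

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-1.5B-Instruct")
text = tok.apply_chat_template(
    [{"role": "user", "content": "Write a haiku about Python."}],
    tokenize=False,
    add_generation_prompt=True,
)
text += "Indentation,"  # the prefill: generation continues from this prefix
```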
@@ -395,10 +402,15 @@ def format_messages_for_model(
         elif role == "assistant":
             prompt += f"<|assistant|>\n{content}\n"
     prompt += "<|assistant|>\n"
+
+    # Append prefill
+    if prefill:
+        prompt = prompt + prefill
+
     return prompt

 # ============================================================================
-# Generation Logic
+# Generation Logic with Thinking Support
 # ============================================================================

 def generate_response(
@@ -408,15 +420,16 @@ def generate_response(
     top_p: float = 0.95,
     top_k: Optional[int] = None,
     stop: Optional[List[str]] = None,
-) -> tuple[str, int, int, str]:
+    enable_thinking: bool = False,
+    thinking_budget: int = 512,
+) -> tuple[str, str, int, int, str]:
     """
-    Generate response from the model
-    Returns: (response_text, input_tokens, output_tokens, stop_reason)
+    Generate response from the model.
+    Returns: (response_text, thinking_text, input_tokens, output_tokens, stop_reason)
     """
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)
     input_length = inputs.input_ids.shape[1]

-    # Generation config
     gen_kwargs = {
         "max_new_tokens": max_tokens,
         "temperature": max(temperature, 0.01),
@@ -432,12 +445,20 @@ def generate_response(
     with torch.no_grad():
         outputs = model.generate(inputs.input_ids, **gen_kwargs)

-    # Decode only the new tokens
     generated_tokens = outputs[0][input_length:]
     response_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

     output_length = len(generated_tokens)
-    stop_reason = "stop"  # Default
+    stop_reason = "stop"
+    thinking_text = ""
+
+    # Simulate thinking by extracting <think>...</think> blocks if present
+    if enable_thinking and "<think>" in response_text:
+        import re
+        think_match = re.search(r"<think>(.*?)</think>", response_text, re.DOTALL)
+        if think_match:
+            thinking_text = think_match.group(1).strip()
+            response_text = re.sub(r"<think>.*?</think>", "", response_text, flags=re.DOTALL).strip()

     # Handle stop sequences
     if stop:
  if stop:
@@ -447,11 +468,10 @@ def generate_response(
447
  stop_reason = "stop"
448
  break
449
 
450
- # Check if max tokens reached
451
  if output_length >= max_tokens:
452
  stop_reason = "length"
453
 
454
- return response_text.strip(), input_length, output_length, stop_reason
455
 
456
  async def generate_stream(
457
  prompt: str,
@@ -492,14 +512,13 @@ async def generate_stream(
492
 
493
  @asynccontextmanager
494
  async def lifespan(app: FastAPI):
495
- """Load model on startup"""
496
  load_model()
497
  yield
498
 
499
  app = FastAPI(
500
  title="Free Coding API",
501
- description="OpenAI & Anthropic compatible API for coding tasks",
502
- version="1.0.0",
503
  lifespan=lifespan
504
  )
505
 
@@ -516,7 +535,6 @@ app.add_middleware(
516
  # ============================================================================
517
 
518
  def verify_api_key(authorization: Optional[str] = None) -> bool:
519
- """Simple API key verification"""
520
  if not API_KEY or API_KEY == "":
521
  return True
522
 
@@ -536,7 +554,6 @@ def verify_api_key(authorization: Optional[str] = None) -> bool:
536
 
537
  @app.get("/v1/models")
538
  async def list_models():
539
- """List available models (OpenAI compatible)"""
540
  models = [
541
  OpenAIModelInfo(id=alias, created=int(time.time()))
542
  for alias in MODEL_ALIASES.keys()
@@ -545,7 +562,6 @@ async def list_models():
545
 
546
  @app.get("/v1/models/{model_id}")
547
  async def get_model(model_id: str):
548
- """Get model info"""
549
  if model_id in MODEL_ALIASES or model_id == MODEL_ID:
550
  return OpenAIModelInfo(id=model_id, created=int(time.time()))
551
  raise HTTPException(status_code=404, detail="Model not found")
@@ -555,7 +571,7 @@ async def openai_chat_completions(
555
  request: OpenAIChatRequest,
556
  authorization: Optional[str] = Header(None),
557
  ):
558
- """OpenAI-compatible chat completions endpoint - Full spec compliance"""
559
 
560
  if not verify_api_key(authorization):
561
  raise HTTPException(status_code=401, detail="Invalid API key")
@@ -566,7 +582,10 @@ async def openai_chat_completions(
566
  content = extract_text_from_openai_content(m.content)
567
  messages.append({"role": m.role, "content": content})
568
 
569
- # Extract system message if present
 
 
 
570
  system_prompt = None
571
  filtered_messages = []
572
  for msg in messages:
@@ -575,12 +594,10 @@ async def openai_chat_completions(
575
  else:
576
  filtered_messages.append(msg)
577
 
578
- prompt = format_messages_for_model(filtered_messages, system_prompt=system_prompt)
579
 
580
- # Determine max tokens
581
  max_tokens = request.max_completion_tokens or request.max_tokens or MAX_TOKENS_DEFAULT
582
 
583
- # Handle stop sequences
584
  stop_sequences = None
585
  if request.stop:
586
  stop_sequences = [request.stop] if isinstance(request.stop, str) else request.stop
@@ -590,9 +607,7 @@ async def openai_chat_completions(
590
  created_time = int(time.time())
591
 
592
  if request.stream:
593
- # OpenAI Streaming format
594
  async def stream_generator():
595
- # First chunk with role
596
  first_chunk = {
597
  "id": request_id,
598
  "object": "chat.completion.chunk",
@@ -601,14 +616,13 @@ async def openai_chat_completions(
                 "system_fingerprint": system_fingerprint,
                 "choices": [{
                     "index": 0,
-                    "delta": {"role": "assistant", "content": ""},
+                    "delta": {"role": "assistant", "content": prefill},  # Include prefill in first chunk
                     "logprobs": None,
                     "finish_reason": None
                 }]
             }
             yield f"data: {json.dumps(first_chunk)}\n\n"

-            # Stream content
             async for token in generate_stream(
                 prompt,
                 max_tokens=max_tokens,
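A client-side sketch of consuming this stream; with a prefill in play, the first delta already carries the prefill text. The URL and model alias are placeholders, and `requests` is assumed to be installed.

```python
import json
import requests

with requests.post(
    "http://localhost:7860/v1/chat/completions",  # placeholder URL
    json={
        "model": "gpt-4o",
        "stream": True,
        "messages": [
            {"role": "user", "content": "Say hi"},
            {"role": "assistant", "content": "Hi"},  # prefill
        ],
    },
    stream=True,
) as r:
    for line in r.iter_lines():
        if line.startswith(b"data: ") and line != b"data: [DONE]":
            chunk = json.loads(line[len(b"data: "):])
            if chunk["choices"]:
                print(chunk["choices"][0]["delta"].get("content") or "", end="")
```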
@@ -630,7 +644,6 @@ async def openai_chat_completions(
                 }
                 yield f"data: {json.dumps(chunk)}\n\n"

-            # Final chunk with finish_reason
             final_chunk = {
                 "id": request_id,
                 "object": "chat.completion.chunk",
@@ -646,7 +659,6 @@ async def openai_chat_completions(
             }
             yield f"data: {json.dumps(final_chunk)}\n\n"

-            # Usage chunk if requested
             if request.stream_options and request.stream_options.get("include_usage"):
                 usage_chunk = {
                     "id": request_id,
@@ -654,11 +666,7 @@ async def openai_chat_completions(
                     "created": created_time,
                     "model": request.model,
                     "choices": [],
-                    "usage": {
-                        "prompt_tokens": 0,
-                        "completion_tokens": 0,
-                        "total_tokens": 0
-                    }
+                    "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
                 }
                 yield f"data: {json.dumps(usage_chunk)}\n\n"
@@ -667,15 +675,11 @@ async def openai_chat_completions(
         return StreamingResponse(
             stream_generator(),
             media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "X-Accel-Buffering": "no"
-            }
+            headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"}
         )

-    # Non-streaming response
-    response_text, input_tokens, output_tokens, stop_reason = generate_response(
+    # Non-streaming
+    response_text, thinking_text, input_tokens, output_tokens, stop_reason = generate_response(
         prompt,
         max_tokens=max_tokens,
         temperature=request.temperature or 1.0,
@@ -683,7 +687,9 @@ async def openai_chat_completions(
         stop=stop_sequences,
     )

-    # Map stop reason to OpenAI format
+    # Prepend prefill to response
+    full_response = prefill + response_text if prefill else response_text
+
     openai_finish_reason = "stop" if stop_reason == "stop" else "length"

     return OpenAIChatResponse(
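The prepend rule above, as a tiny runnable sketch: the returned message is the prefill plus the model's continuation, mirroring how prefilled responses are surfaced.

```python
prefill = '{"answer":'
response_text = " 42}"
full_response = prefill + response_text if prefill else response_text
assert full_response == '{"answer": 42}'
```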
@@ -694,7 +700,7 @@ async def openai_chat_completions(
         choices=[
             OpenAIChoice(
                 index=0,
-                message=OpenAIChoiceMessage(role="assistant", content=response_text),
+                message=OpenAIChoiceMessage(role="assistant", content=full_response),
                 finish_reason=openai_finish_reason,
                 logprobs=None
             )
@@ -707,7 +713,7 @@ async def openai_chat_completions(
     )

 # ============================================================================
-# Anthropic Compatible Endpoints
+# Anthropic Compatible Endpoints with Prefill & Thinking
 # ============================================================================

 @app.post("/v1/messages")
@@ -717,9 +723,8 @@ async def anthropic_messages(
     x_api_key: Optional[str] = Header(None, alias="x-api-key"),
     anthropic_version: Optional[str] = Header(None, alias="anthropic-version"),
 ):
-    """Anthropic-compatible messages endpoint - Full spec compliance"""
+    """Anthropic-compatible messages endpoint with prefill & thinking support"""

-    # Anthropic uses x-api-key header
     auth_key = x_api_key or authorization
     if not verify_api_key(auth_key):
         raise HTTPException(status_code=401, detail="Invalid API key")
@@ -730,19 +735,30 @@ async def anthropic_messages(
         content = extract_text_from_anthropic_content(m.content)
         messages.append({"role": m.role, "content": content})

+    # Check for prefill (last assistant message)
+    messages, prefill = extract_prefill_from_messages(messages)
+
     # Extract system prompt
     system_prompt = extract_system_prompt_anthropic(request.system)

-    prompt = format_messages_for_model(messages, system_prompt=system_prompt)
+    prompt = format_messages_for_model(messages, system_prompt=system_prompt, prefill=prefill)
+
+    # Check thinking configuration
+    enable_thinking = False
+    thinking_budget = 512
+    if request.thinking:
+        if request.thinking.type == "enabled":
+            enable_thinking = True
+        if request.thinking.budget_tokens:
+            thinking_budget = request.thinking.budget_tokens

     request_id = f"msg_{uuid.uuid4().hex[:24]}"

     if request.stream:
-        # Anthropic streaming format (Server-Sent Events)
         async def stream_generator():
-            input_tokens = 0  # Would be calculated from prompt
+            input_tokens = 0

-            # 1. message_start event
+            # message_start
             message_start = {
                 "type": "message_start",
                 "message": {
@@ -753,26 +769,55 @@ async def anthropic_messages(
                     "content": [],
                     "stop_reason": None,
                     "stop_sequence": None,
-                    "usage": {
-                        "input_tokens": input_tokens,
-                        "output_tokens": 0
-                    }
+                    "usage": {"input_tokens": input_tokens, "output_tokens": 0}
                 }
             }
             yield f"event: message_start\ndata: {json.dumps(message_start)}\n\n"

-            # 2. content_block_start event
+            content_index = 0
+
+            # If thinking is enabled, add thinking block first (simulated)
+            if enable_thinking:
+                # thinking block start
+                thinking_block_start = {
+                    "type": "content_block_start",
+                    "index": content_index,
+                    "content_block": {"type": "thinking", "thinking": ""}
+                }
+                yield f"event: content_block_start\ndata: {json.dumps(thinking_block_start)}\n\n"
+
+                # Simulate thinking content
+                thinking_text = "Analyzing the request and formulating a response..."
+                thinking_delta = {
+                    "type": "content_block_delta",
+                    "index": content_index,
+                    "delta": {"type": "thinking_delta", "thinking": thinking_text}
+                }
+                yield f"event: content_block_delta\ndata: {json.dumps(thinking_delta)}\n\n"
+
+                thinking_block_stop = {"type": "content_block_stop", "index": content_index}
+                yield f"event: content_block_stop\ndata: {json.dumps(thinking_block_stop)}\n\n"
+
+                content_index += 1
+
+            # text content block start
             content_block_start = {
                 "type": "content_block_start",
-                "index": 0,
-                "content_block": {
-                    "type": "text",
-                    "text": ""
-                }
+                "index": content_index,
+                "content_block": {"type": "text", "text": ""}
             }
             yield f"event: content_block_start\ndata: {json.dumps(content_block_start)}\n\n"

-            # 3. Stream content_block_delta events
+            # Include prefill in first delta if present
+            if prefill:
+                prefill_delta = {
+                    "type": "content_block_delta",
+                    "index": content_index,
+                    "delta": {"type": "text_delta", "text": prefill}
+                }
+                yield f"event: content_block_delta\ndata: {json.dumps(prefill_delta)}\n\n"
+
+            # Stream content
             output_tokens = 0
             async for token in generate_stream(
                 prompt,
@@ -784,59 +829,61 @@ async def anthropic_messages(
                 output_tokens += 1
                 delta = {
                     "type": "content_block_delta",
-                    "index": 0,
-                    "delta": {
-                        "type": "text_delta",
-                        "text": token
-                    }
+                    "index": content_index,
+                    "delta": {"type": "text_delta", "text": token}
                 }
                 yield f"event: content_block_delta\ndata: {json.dumps(delta)}\n\n"

-            # 4. content_block_stop event
-            content_block_stop = {
-                "type": "content_block_stop",
-                "index": 0
-            }
+            # content_block_stop
+            content_block_stop = {"type": "content_block_stop", "index": content_index}
             yield f"event: content_block_stop\ndata: {json.dumps(content_block_stop)}\n\n"

-            # 5. message_delta event
+            # message_delta
             message_delta = {
                 "type": "message_delta",
-                "delta": {
-                    "stop_reason": "end_turn",
-                    "stop_sequence": None
-                },
-                "usage": {
-                    "output_tokens": output_tokens
-                }
+                "delta": {"stop_reason": "end_turn", "stop_sequence": None},
+                "usage": {"output_tokens": output_tokens}
             }
             yield f"event: message_delta\ndata: {json.dumps(message_delta)}\n\n"

-            # 6. message_stop event
+            # message_stop
             message_stop = {"type": "message_stop"}
             yield f"event: message_stop\ndata: {json.dumps(message_stop)}\n\n"

         return StreamingResponse(
             stream_generator(),
             media_type="text/event-stream",
-            headers={
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "X-Accel-Buffering": "no"
-            }
+            headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"}
         )

     # Non-streaming response
-    response_text, input_tokens, output_tokens, stop_reason = generate_response(
+    response_text, thinking_text, input_tokens, output_tokens, stop_reason = generate_response(
         prompt,
         max_tokens=request.max_tokens,
         temperature=request.temperature or 1.0,
         top_p=request.top_p or 0.999,
         top_k=request.top_k,
         stop=request.stop_sequences,
+        enable_thinking=enable_thinking,
+        thinking_budget=thinking_budget,
     )

-    # Map stop reason to Anthropic format
+    # Prepend prefill to response
+    full_response = prefill + response_text if prefill else response_text
+
+    # Build content blocks
+    content_blocks = []
+
+    # Add thinking block if enabled and we have thinking content
+    if enable_thinking:
+        if not thinking_text:
+            thinking_text = "Analyzing the request and formulating a response."
+        content_blocks.append(AnthropicResponseContent(type="thinking", thinking=thinking_text))

+    # Add text block
+    content_blocks.append(AnthropicResponseContent(type="text", text=full_response))
+
+    # Determine stop reason
     anthropic_stop_reason = "end_turn"
     stop_sequence_used = None
     if stop_reason == "length":
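Taken together with the previous hunk, the stream generator now emits the following event order (a sketch; the thinking events appear only when thinking is enabled, and the text block's index shifts to 1 in that case):

```python
EXPECTED_EVENT_ORDER = [
    "message_start",
    "content_block_start",   # thinking block, index 0 (only when thinking enabled)
    "content_block_delta",   #   thinking_delta
    "content_block_stop",
    "content_block_start",   # text block
    "content_block_delta",   #   text_delta per token; prefill arrives first if set
    "content_block_stop",
    "message_delta",         # stop_reason + output token usage
    "message_stop",
]
```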
@@ -851,7 +898,7 @@ async def anthropic_messages(
     return AnthropicResponse(
         id=request_id,
         model=request.model,
-        content=[AnthropicResponseContent(type="text", text=response_text)],
+        content=content_blocks,
         stop_reason=anthropic_stop_reason,
         stop_sequence=stop_sequence_used,
         usage=AnthropicUsage(
@@ -868,8 +915,13 @@
 async def root():
     return {
         "name": "Free Coding API",
-        "version": "1.0.0",
+        "version": "1.1.0",
         "model": MODEL_ID,
+        "features": {
+            "prefill_response": "Supported - Include assistant message at end for output control",
+            "thinking": "Supported - Enable with thinking: {type: 'enabled'}",
+            "streaming": "Supported - Both OpenAI and Anthropic formats"
+        },
         "compatibility": {
             "openai": "v1 Chat Completions API",
             "anthropic": "Messages API (2023-06-01)"
 