# FastAPI text-generation / summarization demo (Hugging Face Space).
# ("Spaces: Sleeping" lines here were page chrome from the scrape, not code.)
| from fastapi import FastAPI | |
| from fastapi.responses import HTMLResponse | |
| from fastapi.staticfiles import StaticFiles | |
| from pydantic import BaseModel | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline | |
| import torch | |
| import uvicorn | |
| import os | |
app = FastAPI()

# Load the model and tokenizer once at import time; they are shared by every
# request handler below.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Reuse the already-loaded model object. Passing `model_name` here (as the
# original did) makes the pipeline load a second copy of the same weights.
generator_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Separate summarization model (BART fine-tuned for CNN/DailyMail-style summaries).
summarizer_pipe = pipeline("summarization", model="facebook/bart-large-cnn")

app.mount("/static", StaticFiles(directory="static"), name="static")
class GenRequest(BaseModel):
    """Request body shared by the generation/summarization endpoints."""

    # Input prompt (generate mode) or the document to condense (summarize mode).
    text: str
    # Cap on new tokens; also reused as `max_length` for the summarizer.
    max_new_tokens: int = 150
    # Sampling flag; only forwarded to the summarization pipeline.
    do_sample: bool = False
    mode: str = "generate"  # "generate" or "summarize"
async def read_root():
    """Serve the single-page UI by returning templates/index.html as a string.

    NOTE(review): no route decorator is visible in this chunk — presumably
    `@app.get("/", response_class=HTMLResponse)` was lost upstream; confirm.
    """
    # Explicit encoding so the page renders identically on every platform.
    with open("templates/index.html", "r", encoding="utf-8") as f:
        return f.read()
def generate(req: GenRequest):
    """Dispatch a request to summarization or token-level text generation."""
    if req.mode != "summarize":
        # Default path: greedy generation with per-token alternatives.
        return generate_with_alternatives(req)

    # Summarization path: delegate to the BART pipeline.
    summary = summarizer_pipe(
        req.text,
        max_length=req.max_new_tokens,
        min_length=30,
        do_sample=req.do_sample,
    )
    return {"generated_text": summary[0]["summary_text"]}
def generate_with_alternatives(req: GenRequest):
    """Greedily generate up to ``req.max_new_tokens`` tokens, recording the
    top-5 candidate tokens (with percent probabilities) at every step.

    Returns::

        {"generated_text": str,
         "tokens": [{"token": str,
                     "alternatives": [{"token": str, "probability": float}]}]}

    NOTE(review): ``req.do_sample`` is ignored here — decoding is always
    greedy; confirm whether sampling support was intended.
    """
    current_text = req.text
    tokens_data = []

    for _ in range(req.max_new_tokens):
        # Re-encode prompt + generation so far each step (simple but O(n^2)
        # overall; acceptable for short demo generations).
        inputs = tokenizer(current_text, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        next_token_logits = outputs.logits[0, -1, :]
        probs = torch.softmax(next_token_logits, dim=-1)

        # Top-5 candidates at this position.
        top_k = 5
        top_probs, top_indices = torch.topk(probs, top_k)

        # Greedy choice: highest-probability token.
        chosen_token_id = top_indices[0].item()

        # Stop BEFORE emitting end-of-sequence: the original appended the EOS
        # token first and broke afterwards, leaking the EOS marker text into
        # the returned output.
        if chosen_token_id == tokenizer.eos_token_id:
            break

        chosen_token = tokenizer.decode([chosen_token_id])
        alternatives = [
            {
                "token": tokenizer.decode([top_indices[i].item()]),
                "probability": round(top_probs[i].item() * 100, 2),
            }
            for i in range(top_k)
        ]
        tokens_data.append({"token": chosen_token, "alternatives": alternatives})
        current_text += chosen_token

    generated_text = "".join(t["token"] for t in tokens_data)
    return {"generated_text": generated_text, "tokens": tokens_data}
def predict_next(req: GenRequest):
    """Return the model's top-10 next-token predictions for ``req.text``.

    Each prediction is ``{"token": str, "probability": float}`` with the
    probability expressed as a percentage rounded to two decimals.
    """
    encoded = tokenizer(req.text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**encoded).logits[0, -1, :]

    probabilities = torch.softmax(logits, dim=-1)
    top = torch.topk(probabilities, 10)

    predictions = [
        {"token": tokenizer.decode([token_id]), "probability": round(p * 100, 2)}
        for p, token_id in zip(top.values.tolist(), top.indices.tolist())
    ]
    return {"predictions": predictions}
| if __name__ == "__main__": | |
| port = int(os.environ.get("PORT", 7860)) | |
| uvicorn.run(app, host="0.0.0.0", port=port) | |