Spaces:
Sleeping
Sleeping
| # func.py ── utilities for Hugging Face Space | |
| # Step1. Image to Text | |
| from typing import Union | |
| from pathlib import Path | |
| from PIL import Image | |
| from transformers import pipeline | |
# Cached captioning pipeline; stays None until the first caption is requested.
_captioner = None


def _get_captioner():
    """Return the shared image-to-text pipeline, building it on first use."""
    global _captioner
    # Fast path: the pipeline was already built by an earlier call.
    if _captioner is not None:
        return _captioner
    _captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-large",
    )
    return _captioner
def img2text(img: Union[Image.Image, str, Path]) -> str:
    """
    Return a short English caption for an image.

    Args:
        img: An already-loaded PIL image, or a local path (str or
            pathlib.Path) to an image file.

    Returns:
        The ``generated_text`` field of the first result returned by the
        captioning pipeline.
    """
    if not isinstance(img, Image.Image):
        # Open through a context manager so the file handle is released
        # here instead of lingering; load() pulls the pixel data into
        # memory first, so the image stays usable after the file closes.
        with Image.open(img) as opened:
            opened.load()
        img = opened
    return _get_captioner()(img)[0]["generated_text"]
| # Step2. Text Generation (Based on Caption) | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
# Checkpoint name handed to the tokenizer / model loaders.
_MODEL_NAME = "aspis/gpt2-genre-story-generation"

# Prompt template; ``{caption}`` is replaced with the image caption.
_PROMPT = (
    "Write a funny and warm children's story (50-100 words) for ages 3-10, "
    "fully and strictly based on this scene: {caption}\nStory:"
)

# Module-level cache slots; both stay None until the model is first loaded.
_tokenizer = None
_model = None
def _load_story_model():
    """Return the cached ``(tokenizer, model)`` pair, loading both on first call."""
    global _tokenizer, _model
    # Already loaded by a previous call: hand back the cached objects.
    if _model is not None:
        return _tokenizer, _model
    _tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
    _model = AutoModelForCausalLM.from_pretrained(_MODEL_NAME)
    # Use the GPU when one is visible to torch.
    if torch.cuda.is_available():
        _model = _model.to("cuda")
    return _tokenizer, _model
def _trim_story(text: str, max_words: int = 100) -> str:
    """Cap *text* at ``max_words`` words, then cut back to the last full sentence."""
    capped = " ".join(text.split()[:max_words])
    # Trim AFTER the word cap so the result never stops mid-sentence;
    # text without any sentence terminator is returned as capped.
    last_end = max(capped.rfind(mark) for mark in ".!?")
    return capped[: last_end + 1] if last_end != -1 else capped


def text2story(caption: str) -> str:
    """
    Generate a short children's story from an image caption.

    Args:
        caption: Scene description string inserted into the prompt.

    Returns:
        Story text of at most 100 words, ending at the last complete
        sentence within that limit when the text contains one.
    """
    tok, mdl = _load_story_model()
    prompt = _PROMPT.format(caption=caption)
    inputs = tok(prompt, return_tensors="pt", add_special_tokens=False)
    # Put the encoded prompt on whatever device the model lives on.
    inputs = {k: v.to(mdl.device) for k, v in inputs.items()}
    gen_ids = mdl.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
        pad_token_id=tok.eos_token_id,
        repetition_penalty=1.1,
    )[0]
    # The output starts with the prompt tokens; keep only the continuation.
    prompt_len = inputs["input_ids"].shape[-1]
    story = tok.decode(gen_ids[prompt_len:], skip_special_tokens=True).strip()
    return _trim_story(story)
| # Step3. Text to Audio | |
| import numpy as np | |
| import textwrap | |
| import soundfile as sf | |
| from transformers import pipeline | |
# Checkpoint name passed to the text-to-speech pipeline.
_TTS_MODEL = "facebook/mms-tts-eng"
# Cached pipeline instance; stays None until first use.
_tts = None


def _get_tts():
    """Return the shared text-to-speech pipeline, creating it on first use."""
    global _tts
    # Reuse the pipeline built by an earlier call when there is one.
    if _tts is not None:
        return _tts
    _tts = pipeline("text-to-speech", model=_TTS_MODEL)
    return _tts
def story2audio(story: str, wav_path: str = "story.wav") -> str:
    """
    Synthesize speech for a story and save it as a WAV file.

    Args:
        story: Text to speak, e.g. the value returned by `text2story(...)`.
        wav_path: Output file name.

    Returns:
        Path to the saved WAV file (the same value as ``wav_path``).

    Raises:
        ValueError: If ``story`` is empty or contains only whitespace.
    """
    # Split long text into pieces of at most 200 characters; wrap() returns
    # an empty list for empty or whitespace-only input.
    chunks = textwrap.wrap(story, width=200)
    if not chunks:
        # Fail early with a clear message, before the TTS model is loaded,
        # instead of letting np.concatenate choke on an empty list.
        raise ValueError("story2audio() requires non-empty story text")

    tts = _get_tts()
    audio = np.concatenate([tts(chunk)["audio"].squeeze() for chunk in chunks])
    sf.write(wav_path, audio, tts.model.config.sampling_rate)
    return wav_path