Update app.py

app.py CHANGED
@@ -16,23 +16,13 @@ import torch
 # ======================================
 
 # Initialize image captioning pipeline with pretrained model
-# Model source: Hugging Face Model Hub
 _image_caption_pipeline = pipeline(
     task="image-to-text",
     model="noamrot/FuseCap_Image_Captioning"
 )
 
 # Global model configuration constants
-
-_THINKING_TOKEN_ID = 151668  # Special token marking thinking/content separation
-
-# Initialize model components once
-_tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
-_model = AutoModelForCausalLM.from_pretrained(
-    _MODEL_NAME,
-    torch_dtype="auto",
-    device_map="auto"
-)
+_text_generation_pipeline = pipeline("text-generation", model="Qwen/Qwen1.5-0.5B", max_new_tokens=100)
 
 # Initialize TTS components once to avoid reloading
 _SPEECH_PIPELINE = pipeline("text-to-speech", model="microsoft/speecht5_tts")
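For readers of the diff, a minimal sketch (not part of the commit) of how the replacement text-generation pipeline is typically used with chat-style input. It assumes the Qwen/Qwen1.5-0.5B checkpoint ships a chat template that transformers can apply; the prompts are made up:

# Illustrative sketch only: assumes the checkpoint's tokenizer carries a chat
# template, so the pipeline accepts a list of role/content messages.
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="Qwen/Qwen1.5-0.5B",  # same checkpoint as in the commit
    max_new_tokens=100,         # generation budget fixed at construction time
)

messages = [
    {"role": "system", "content": "You are a storyteller."},        # made-up prompt
    {"role": "user", "content": "Tell a story about a red kite."},  # made-up prompt
]

outputs = generator(messages)
# With chat input, generated_text holds the whole conversation: both input
# turns plus the model's assistant turn appended at the end.
print(outputs[0]["generated_text"][-1]["content"])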
@@ -95,76 +85,19 @@ def generate_story_content(system_prompt: str, user_prompt: str) -> str:
             {"role": "system", "content": system_prompt},
             {"role": "user", "content": user_prompt}
         ]
-
-        # Apply chat template to format conversation for the model
-        formatted_input = _tokenizer.apply_chat_template(
-            conversation_history,
-            tokenize=False,
-            add_generation_prompt=True,
-            enable_thinking=False
-        )
-
-        # Tokenize and prepare model inputs
-        model_inputs = _tokenizer(
-            [formatted_input],
-            return_tensors="pt"
-        ).to(_model.device)
-
-        # Generate text completion
-        generated_sequences = _model.generate(
-            **model_inputs,
-            max_new_tokens=150
-        )
+
+        # Generate the story
+        story = _text_generation_pipeline(conversation_history)
+
+        # Extract the story result
+        story_result = story[0]["generated_text"][2]["content"]
 
         # Process and clean output
-        return _process_generated_output(
-            generated_sequences,
-            model_inputs.input_ids
-        )
+        return story_result
 
     except Exception as error:
         raise RuntimeError(f"Story generation failed: {str(error)}") from error
 
-def _process_generated_output(generated_sequences: list, input_ids: list) -> str:
-    """
-    Processes raw model output to extract final content.
-
-    Args:
-        generated_sequences: Raw output sequences from model generation
-        input_ids: Original input token IDs used for generation
-
-    Returns:
-        Cleaned final content text
-    """
-    # Extract new tokens excluding original prompt
-    new_tokens = generated_sequences[0][len(input_ids[0]):].tolist()
-
-    # Find separation point between thinking and final content
-    separation_index = _find_thinking_separation(new_tokens)
-
-    # Decode and clean final content
-    return _tokenizer.decode(
-        new_tokens[separation_index:],
-        skip_special_tokens=True
-    ).strip("\n")
-
-def _find_thinking_separation(token_sequence: list) -> int:
-    """
-    Locates the boundary between thinking process and final content.
-
-    Args:
-        token_sequence: List of generated token IDs
-
-    Returns:
-        Index position marking the start of final content
-    """
-    try:
-        # Search from end for separation token
-        reverse_position = token_sequence[::-1].index(_THINKING_TOKEN_ID)
-        return len(token_sequence) - reverse_position
-    except ValueError:
-        return 0  # Return start if token not found
-
 def generate_audio_from_story(story_text: str, output_path: str = "output.wav") -> str:
     """
     Convert text story to speech audio file using text-to-speech synthesis.
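An aside on the extraction in the hunk above (not part of the diff): indexing the assistant turn at the fixed position [2] only holds while the conversation contains exactly one system and one user message. A defensive variant, with a helper name invented for illustration:

# Hypothetical helper, not in the commit: pull the assistant reply out of a
# chat-style text-generation pipeline output without hard-coding its index.
def extract_assistant_reply(pipeline_output: list) -> str:
    conversation = pipeline_output[0]["generated_text"]
    # Walk backwards so the most recent assistant turn wins.
    for turn in reversed(conversation):
        if turn.get("role") == "assistant":
            return turn["content"]
    raise ValueError("no assistant turn found in pipeline output")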
@@ -238,7 +171,6 @@ st.markdown("""
         margin: 20px 0;
         box-shadow: 0 4px 8px rgba(0,0,0,0.1);
     }
-
     /* Image caption styling */
     .image-caption {
         border-left: 4px solid #4CAF50;
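For context on the final hunk: the st.markdown(""" in the hunk header suggests the CSS sits inside a Streamlit markdown call, roughly as sketched below. The unsafe_allow_html=True flag is an assumption, since Streamlit does not render raw HTML/CSS without it.

# Assumed surrounding call (only st.markdown(""" is visible in the hunk
# header): custom CSS injected into the Streamlit page as raw HTML.
import streamlit as st

st.markdown("""
<style>
    /* Image caption styling */
    .image-caption {
        border-left: 4px solid #4CAF50;
    }
</style>
""", unsafe_allow_html=True)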