Spaces:

aeresd
/

ISOM5240_Assignment1

Sleeping

App Files Files Community

aeresd committed on May 2, 2025

Commit

aa2ae39

verified ·

1 Parent(s): a274a68

Create app.py

Browse files

Files changed (1) hide show

app.py +146 -0

app.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import io
+import logging
+import re
+
+import numpy as np
+import soundfile as sf
+import streamlit as st
+import torch
+from datasets import load_dataset
+from PIL import Image
+from transformers import pipeline, AutoTokenizer
+# Configure the logging system: root logger at INFO level, plus a
+# module-level logger used by the rest of this file.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# ==================== 模型缓存加载 ====================
+@st.cache_resource(show_spinner=False)
+def load_models():
+    """预加载所有模型并缓存"""
+    logger.info("Loading caption model...")
+    caption_model = pipeline("image-to-text",
+                            model="Salesforce/blip-image-captioning-base",
+                            device=0 if torch.cuda.is_available() else -1)
+    logger.info("Loading story model...")
+    story_model = pipeline(
+        "text-generation",
+        model="Tincando/fiction_story_generator",
+        device=0 if torch.cuda.is_available() else -1,
+        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+    )
+    logger.info("Loading TTS model...")
+    tts_model = pipeline("text-to-audio",
+                        model="Chan-Y/speecht5_finetuned_tr_commonvoice",
+                        device=0 if torch.cuda.is_available() else -1)
+    tts_tokenizer = AutoTokenizer.from_pretrained("Chan-Y/speecht5_finetuned_tr_commonvoice")
+    return caption_model, story_model, tts_model, tts_tokenizer
+# ==================== Streamlit 界面配置 ====================
+st.set_page_config(
+    page_title="🧸 AI Story Generator Pro",
+    page_icon="📖",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# ==================== 侧边栏参数设置 ====================
+with st.sidebar:
+    st.title("⚙️ 生成参数")
+    temperature = st.slider("创意度", 0.5, 1.5, 0.85, step=0.05)
+    max_length = st.slider("故事长度", 100, 500, 200)
+    story_style = st.selectbox("故事风格", ["童话", "科幻", "冒险"])
+    voice_speed = st.slider("语音速度", 0.5, 2.0, 1.0)
+# ==================== 主界面 ====================
+st.title("🖼️ AI 智能故事生成器")
+st.write("上传图片即可获得定制化故事与语音朗读")
+# ==================== 文件上传 ====================
+uploaded_file = st.file_uploader("选择图片文件", type=["jpg", "jpeg", "png"])
+if uploaded_file:
+    # ==================== 图像处理 ====================
+    col1, col2 = st.columns([1, 2])
+    with col1:
+        image = Image.open(uploaded_file)
+        st.image(image, caption="上传图片", use_column_width=True)
+    # ==================== 生成流程 ====================
+    if st.button("开始生成", type="primary"):
+        try:
+            progress_bar = st.progress(0)
+            status_text = st.empty()
+            # 加载模型
+            with st.spinner("🔄 正在加载模型..."):
+                caption_model, story_model, tts_model, tts_tokenizer = load_models()
+                speaker_emb = torch.tensor(
+                    load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
+                ).unsqueeze(0)
+            progress_bar.progress(20)
+            # 图像描述生成
+            with st.spinner("📷 正在分析图片内容..."):
+                caption_result = caption_model(image)
+                caption = caption_result[0]['generated_text']
+                progress_bar.progress(40)
+            # 故事生成
+            with st.spinner("✍️ 正在创作精彩故事..."):
+                prompt = f"以{story_style}风格创作儿童故事，主题：{caption}"
+                story = story_model(
+                    prompt,
+                    temperature=temperature,
+                    max_length=max_length,
+                    do_sample=True
+                )[0]['generated_text']
+                story = re.sub(r'[^.!?]+$', '', story)  # 确保完整结尾
+                progress_bar.progress(70)
+            # 语音合成
+            with st.spinner("🔊 正在生成语音..."):
+                chunks = re.split(r'(?<=[.!?]) +', story)
+                audio_arrays = []
+                for chunk in chunks:
+                    inputs = tts_tokenizer(chunk, return_tensors="pt")
+                    speech = tts_model.generate(
+                        inputs["input_ids"],
+                        forward_params={
+                            "speaker_embeddings": speaker_emb,
+                            "speed": voice_speed
+                        }
+                    )
+                    audio_arrays.append(speech.numpy())
+                combined = np.concatenate(audio_arrays)
+                sf.write("output.wav", combined, samplerate=16000)
+                progress_bar.progress(100)
+            # ==================== 结果展示 ====================
+            with col2:
+                st.subheader("📖 生成故事")
+                st.success(story)
+                st.subheader("🔊 语音朗读")
+                st.audio("output.wav", format="audio/wav")
+                # 下载功能
+                st.download_button(
+                    label="下载故事文本",
+                    data=story,
+                    file_name="generated_story.txt",
+                    mime="text/plain"
+                )
+                st.download_button(
+                    label="下载语音文件",
+                    data=open("output.wav", "rb"),
+                    file_name="story_audio.wav",
+                    mime="audio/wav"
+                )
+        except Exception as e:
+            st.error(f"生成失败：{str(e)}")
+            st.button("重试", on_click=st.cache_resource.clear)