Spaces:

aeresd
/

ISOM5240_Assignment1

Sleeping

App Files Community

aeresd committed on May 2, 2025

Commit

1634b47

verified ·

1 Parent(s): 80ebd89

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -54

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import streamlit as st
 from transformers import pipeline, AutoTokenizer
 import torch
 import re
@@ -8,36 +8,42 @@ from PIL import Image
 from datasets import load_dataset
 import logging
-# 配置日志系统
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# ==================== 模型缓存加载 ====================
 @st.cache_resource(show_spinner=False)
 def load_models():
-    """预加载所有模型并缓存"""
-    logger.info("Loading caption model...")
-    caption_model = pipeline("image-to-text",
-                            model="Salesforce/blip-image-captioning-base",
-                            device=0 if torch.cuda.is_available() else -1)
-    logger.info("Loading story model...")
     story_model = pipeline(
-        "text-generation",
         model="Tincando/fiction_story_generator",
         device=0 if torch.cuda.is_available() else -1,
         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
     )
-    logger.info("Loading TTS model...")
-    tts_model = pipeline("text-to-audio",
-                        model="Chan-Y/speecht5_finetuned_tr_commonvoice",
-                        device=0 if torch.cuda.is_available() else -1)
-    tts_tokenizer = AutoTokenizer.from_pretrained("Chan-Y/speecht5_finetuned_tr_commonvoice")
     return caption_model, story_model, tts_model, tts_tokenizer
-# ==================== Streamlit 界面配置 ====================
 st.set_page_config(
     page_title="🧸 AI Story Generator Pro",
     page_icon="📖",
@@ -45,65 +51,65 @@ st.set_page_config(
     initial_sidebar_state="expanded"
 )
-# ==================== 侧边栏参数设置 ====================
 with st.sidebar:
-    st.title("⚙️ 生成参数")
-    temperature = st.slider("创意度", 0.5, 1.5, 0.85, step=0.05)
-    max_length = st.slider("故事长度", 100, 500, 200)
-    story_style = st.selectbox("故事风格", ["童话", "科幻", "冒险"])
-    voice_speed = st.slider("语音速度", 0.5, 2.0, 1.0)
-# ==================== 主界面 ====================
-st.title("🖼️ AI 智能故事生成器")
-st.write("上传图片即可获得定制化故事与语音朗读")
-# ==================== 文件上传 ====================
-uploaded_file = st.file_uploader("选择图片文件", type=["jpg", "jpeg", "png"])
 if uploaded_file:
-    # ==================== 图像处理 ====================
     col1, col2 = st.columns([1, 2])
     with col1:
         image = Image.open(uploaded_file)
-        st.image(image, caption="上传图片", use_column_width=True)
-    # ==================== 生成流程 ====================
-    if st.button("开始生成", type="primary"):
         try:
             progress_bar = st.progress(0)
             status_text = st.empty()
-            # 加载模型
-            with st.spinner("🔄 正在加载模型..."):
                 caption_model, story_model, tts_model, tts_tokenizer = load_models()
                 speaker_emb = torch.tensor(
                     load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
                 ).unsqueeze(0)
             progress_bar.progress(20)
-            # 图像描述生成
-            with st.spinner("📷 正在分析图片内容..."):
                 caption_result = caption_model(image)
                 caption = caption_result[0]['generated_text']
-                progress_bar.progress(40)
-            # 故事生成
-            with st.spinner("✍️ 正在创作精彩故事..."):
-                prompt = f"以{story_style}风格创作儿童故事，主题：{caption}"
                 story = story_model(
                     prompt,
                     temperature=temperature,
                     max_length=max_length,
                     do_sample=True
                 )[0]['generated_text']
-                story = re.sub(r'[^.!?]+$', '', story)  # 确保完整结尾
-                progress_bar.progress(70)
-            # 语音合成
-            with st.spinner("🔊 正在生成语音..."):
                 chunks = re.split(r'(?<=[.!?]) +', story)
                 audio_arrays = []
                 for chunk in chunks:
                     inputs = tts_tokenizer(chunk, return_tensors="pt")
                     speech = tts_model.generate(
@@ -114,33 +120,32 @@ if uploaded_file:
                         }
                     )
                     audio_arrays.append(speech.numpy())
                 combined = np.concatenate(audio_arrays)
                 sf.write("output.wav", combined, samplerate=16000)
-                progress_bar.progress(100)
-            # ==================== 结果展示 ====================
             with col2:
-                st.subheader("📖 生成故事")
                 st.success(story)
-                st.subheader("🔊 语音朗读")
                 st.audio("output.wav", format="audio/wav")
-                # 下载功能
                 st.download_button(
-                    label="下载故事文本",
                     data=story,
                     file_name="generated_story.txt",
                     mime="text/plain"
                 )
                 st.download_button(
-                    label="下载语音文件",
                     data=open("output.wav", "rb"),
                     file_name="story_audio.wav",
                     mime="audio/wav"
                 )
         except Exception as e:
-            st.error(f"生成失败：{str(e)}")
-            st.button("重试", on_click=st.cache_resource.clear)

+import streamlit as st
 from transformers import pipeline, AutoTokenizer
 import torch
 import re
 from datasets import load_dataset
 import logging
+# Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# ==================== Model loading with caching ====================
 @st.cache_resource(show_spinner=False)
 def load_models():
+    """Pre-load and cache all models"""
+    logger.info("Loading image captioning model...")
+    caption_model = pipeline(
+        task="image-to-text",
+        model="Salesforce/blip-image-captioning-base",
+        device=0 if torch.cuda.is_available() else -1
+    )
+    logger.info("Loading story generation model...")
     story_model = pipeline(
+        task="text-generation",
         model="Tincando/fiction_story_generator",
         device=0 if torch.cuda.is_available() else -1,
         torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
     )
+    logger.info("Loading text-to-speech model...")
+    tts_model = pipeline(
+        task="text-to-audio",
+        model="Chan-Y/speecht5_finetuned_tr_commonvoice",
+        device=0 if torch.cuda.is_available() else -1
+    )
+    tts_tokenizer = AutoTokenizer.from_pretrained(
+        "Chan-Y/speecht5_finetuned_tr_commonvoice"
+    )
     return caption_model, story_model, tts_model, tts_tokenizer
+# ==================== Streamlit page configuration ====================
 st.set_page_config(
     page_title="🧸 AI Story Generator Pro",
     page_icon="📖",
     initial_sidebar_state="expanded"
 )
+# ==================== Sidebar settings ====================
 with st.sidebar:
+    st.title("⚙️ Generation Settings")
+    temperature = st.slider("Creativity", 0.5, 1.5, 0.85, step=0.05)
+    max_length = st.slider("Story Length", 100, 500, 200)
+    story_style = st.selectbox("Story Style", ["Fairy Tale", "Sci-Fi", "Adventure"])
+    voice_speed = st.slider("Voice Speed", 0.5, 2.0, 1.0)
+# ==================== Main interface ====================
+st.title("🖼️ AI Story Generator")
+st.write("Upload an image to get a customized story with audio narration.")
+# ==================== File upload ====================
+uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"])
 if uploaded_file:
+    # ==================== Image display ====================
     col1, col2 = st.columns([1, 2])
     with col1:
         image = Image.open(uploaded_file)
+        st.image(image, caption="Uploaded Image", use_column_width=True)
+    # ==================== Generation process ====================
+    if st.button("Generate Story", type="primary"):
         try:
             progress_bar = st.progress(0)
             status_text = st.empty()
+            # Load models
+            with st.spinner("🔄 Loading models..."):
                 caption_model, story_model, tts_model, tts_tokenizer = load_models()
                 speaker_emb = torch.tensor(
                     load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
                 ).unsqueeze(0)
             progress_bar.progress(20)
+            # Generate image caption
+            with st.spinner("📷 Analyzing image content..."):
                 caption_result = caption_model(image)
                 caption = caption_result[0]['generated_text']
+            progress_bar.progress(40)
+            # Generate story
+            with st.spinner("✍️ Writing the story..."):
+                prompt = f"Write a children's story in {story_style} style about: {caption}"
                 story = story_model(
                     prompt,
                     temperature=temperature,
                     max_length=max_length,
                     do_sample=True
                 )[0]['generated_text']
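+                # Drop any trailing fragment after the last '.', '!' or '?' so the story
+                # ends on a complete sentence (leaves an empty string if none is present)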
+                story = re.sub(r'[^.!?]+$', '', story)
+            progress_bar.progress(70)
+            # Text-to-speech synthesis
+            with st.spinner("🔊 Generating audio..."):
                 chunks = re.split(r'(?<=[.!?]) +', story)
                 audio_arrays = []
                 for chunk in chunks:
                     inputs = tts_tokenizer(chunk, return_tensors="pt")
                     speech = tts_model.generate(
                         }
                     )
                     audio_arrays.append(speech.numpy())
                 combined = np.concatenate(audio_arrays)
                 sf.write("output.wav", combined, samplerate=16000)
+            progress_bar.progress(100)
+            # ==================== Display results ====================
             with col2:
+                st.subheader("📖 Generated Story")
                 st.success(story)
+                st.subheader("🔊 Audio Narration")
                 st.audio("output.wav", format="audio/wav")
+                # Download buttons
                 st.download_button(
+                    label="Download Story Text",
                     data=story,
                     file_name="generated_story.txt",
                     mime="text/plain"
                 )
                 st.download_button(
+                    label="Download Audio File",
                     data=open("output.wav", "rb"),
                     file_name="story_audio.wav",
                     mime="audio/wav"
                 )
         except Exception as e:
+            st.error(f"Generation failed: {str(e)}")
+            st.button("Retry", on_click=st.cache_resource.clear)