Luigi committed on
Commit
d4fd1c3
·
1 Parent(s): 7ac9e1f

refactor: fix streaming summary bug and simplify to streaming-only output

Browse files
Files changed (2) hide show
  1. README.md +37 -0
  2. summarize_transcript.py +81 -26
README.md ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Transcript Summarization Script
2
+
3
+ This script provides functionality to summarize transcripts using the Falcon-H1-Tiny-Multilingual model with SYCL acceleration. It focuses on live streaming summarization for immediate feedback.
4
+
5
+ ## Key Features
6
+
7
+ ### 1. State Isolation
8
+ Each summarization call ensures a clean state by calling `llm.reset()` after each operation. This prevents any carryover from previous summarizations, ensuring consistent and independent results.
9
+
10
+ ### 2. Live Streaming Summary
11
+ The script implements a live streaming summary feature that generates the summary in real-time, displaying tokens as they are produced by the model. This provides immediate feedback.
12
+
13
+ ### 3. Multi-language Support
14
+ The script supports both English and Traditional Chinese (zh-TW) summarization.
15
+
16
+ ## Functions
17
+
18
+ ### `stream_summarize_transcript(llm, transcript, language='zh-TW')`
19
+ Performs live streaming summary by generating the summary in real-time and displaying tokens as they are produced by the model.
20
+
21
+ ## Improvements Made
22
+
23
+ 1. **Streaming-Only Workflow**: Simplified the script to focus on real-time streaming for all summaries.
24
+ 2. **State Isolation**: Added `llm.reset()` calls after each summarization to ensure clean state between operations.
25
+ 3. **True Live Streaming**: Implemented real-time token streaming using `create_chat_completion` for immediate output display.
26
+ 4. **Reduced Verbosity**: Set `verbose=False` for cleaner output during model operations.
27
+
28
+ ## Usage
29
+
30
+ ```bash
31
+ python summarize_transcript.py
32
+ ```
33
+
34
+ The script will:
35
+ 1. Load the model.
36
+ 2. Generate Chinese and English summaries using live streaming.
37
+ 3. Save the summaries to `chinese_summary.txt` and `english_summary.txt`.
summarize_transcript.py CHANGED
@@ -9,7 +9,7 @@ from huggingface_hub import hf_hub_download
9
 
10
  def load_model():
11
  """Load the model from Hugging Face Hub."""
12
-
13
  # Initialize the model with SYCL support
14
  llm = Llama.from_pretrained(
15
  repo_id="Luigi/Falcon-H1-Tiny-Multilingual-100M-Instruct-GGUF",
@@ -17,13 +17,13 @@ def load_model():
17
  n_gpu_layers=-1, # Use all layers on GPU
18
  seed=1337,
19
  n_ctx=32768, # Context size
20
- verbose=True,
21
  n_batch=1024,
22
  n_ubatch=512,
23
  v_type=2,
24
  k_type=2
25
  )
26
-
27
  return llm
28
 
29
  def read_transcript(file_path):
@@ -64,46 +64,101 @@ def summarize_transcript(llm, transcript, language='zh-TW'):
64
  stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"]
65
  )
66
 
 
67
  llm.reset()
68
-
69
  return output['choices'][0]['message']['content'].strip()
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  def main():
72
  print("Loading Falcon-H1-Tiny-Multilingual model with SYCL acceleration...")
73
-
74
  # Load the model
75
  llm = load_model()
76
-
77
  # Read the transcript
78
  transcript_path = "/home/luigi/tiny-scribe/transcripts/short.txt"
79
  transcript = read_transcript(transcript_path)
80
-
81
- print("\nOriginal Transcript:")
82
  print(transcript[:500] + "..." if len(transcript) > 500 else transcript)
83
-
84
- # Summarize in Chinese (zh-TW)
85
- print("\nGenerating Chinese (zh-TW) summary...")
86
- chinese_summary = summarize_transcript(llm, transcript, language='zh-TW')
87
- print("Chinese Summary:")
88
- print(chinese_summary)
89
-
90
- # Summarize in English
91
- print("\nGenerating English summary...")
92
- english_summary = summarize_transcript(llm, transcript, language='en')
93
- print("English Summary:")
94
- print(english_summary)
95
-
96
  # Save summaries to files
97
  with open("/home/luigi/tiny-scribe/chinese_summary.txt", 'w', encoding='utf-8') as f:
98
  f.write(chinese_summary)
99
-
100
  with open("/home/luigi/tiny-scribe/english_summary.txt", 'w', encoding='utf-8') as f:
101
  f.write(english_summary)
102
-
103
- print("\nSummaries saved to files.")
104
-
105
  # Clean up
106
  del llm
107
 
108
  if __name__ == "__main__":
109
- main()
 
9
 
10
  def load_model():
11
  """Load the model from Hugging Face Hub."""
12
+
13
  # Initialize the model with SYCL support
14
  llm = Llama.from_pretrained(
15
  repo_id="Luigi/Falcon-H1-Tiny-Multilingual-100M-Instruct-GGUF",
 
17
  n_gpu_layers=-1, # Use all layers on GPU
18
  seed=1337,
19
  n_ctx=32768, # Context size
20
+ verbose=False, # Reduced verbosity for cleaner output
21
  n_batch=1024,
22
  n_ubatch=512,
23
  v_type=2,
24
  k_type=2
25
  )
26
+
27
  return llm
28
 
29
  def read_transcript(file_path):
 
64
  stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"]
65
  )
66
 
67
+ # Reset the model state to ensure clean state for next call
68
  llm.reset()
69
+
70
  return output['choices'][0]['message']['content'].strip()
71
 
72
+ def stream_summarize_transcript(llm, transcript, language='zh-TW'):
73
+ """
74
+ Perform live streaming summary by getting real-time token output from the model.
75
+
76
+ Args:
77
+ llm: The loaded language model
78
+ transcript: The full transcript to summarize
79
+ language: Language for the summary ('en' or 'zh-TW')
80
+ """
81
+ # Truncate the transcript to fit within the context window
82
+ max_transcript_length = 1000 # Leave room for prompt and response
83
+
84
+ if len(transcript) > max_transcript_length:
85
+ transcript = transcript[:max_transcript_length]
86
+ print(f"Transcript truncated to {max_transcript_length} characters to fit context window.")
87
+
88
+ # Use the model's chat format based on its template
89
+ if language == 'en':
90
+ messages = [
91
+ {"role": "system", "content": "You are a helpful assistant that summarizes transcripts."},
92
+ {"role": "user", "content": f"Please summarize the following transcript:\n\n{transcript}"}
93
+ ]
94
+ else: # Default to zh-TW
95
+ messages = [
96
+ {"role": "system", "content": "你是一個有助的助手,負責總結轉錄內容。"},
97
+ {"role": "user", "content": f"請總結以下內容:\n\n{transcript}"}
98
+ ]
99
+
100
+ # Generate the summary using streaming completion
101
+ print(f"\nStreaming {language} summary:")
102
+ print("="*50)
103
+
104
+ full_response = ""
105
+
106
+ stream = llm.create_chat_completion(
107
+ messages=messages,
108
+ max_tokens=512,
109
+ temperature=0.3,
110
+ top_p=0.9,
111
+ repeat_penalty=1.1,
112
+ stop=["<|end_of_text|>", "<|eot_id|>", "<|eom_id|>"],
113
+ stream=True
114
+ )
115
+
116
+ for chunk in stream:
117
+ if 'choices' in chunk and len(chunk['choices']) > 0:
118
+ delta = chunk['choices'][0].get('delta', {})
119
+ content = delta.get('content', '')
120
+ if content:
121
+ print(content, end='', flush=True)
122
+ full_response += content
123
+
124
+ print("\n" + "="*50)
125
+
126
+ # Reset the model state to ensure clean state for next call
127
+ llm.reset()
128
+
129
+ return full_response.strip()
130
+
131
+
132
  def main():
133
  print("Loading Falcon-H1-Tiny-Multilingual model with SYCL acceleration...")
134
+
135
  # Load the model
136
  llm = load_model()
137
+
138
  # Read the transcript
139
  transcript_path = "/home/luigi/tiny-scribe/transcripts/short.txt"
140
  transcript = read_transcript(transcript_path)
141
+
142
+ print("\nOriginal Transcript (Preview):")
143
  print(transcript[:500] + "..." if len(transcript) > 500 else transcript)
144
+
145
+ # Summarize in Chinese (zh-TW) with streaming
146
+ chinese_summary = stream_summarize_transcript(llm, transcript, language='zh-TW')
147
+
148
+ # Summarize in English with streaming
149
+ english_summary = stream_summarize_transcript(llm, transcript, language='en')
150
+
 
 
 
 
 
 
151
  # Save summaries to files
152
  with open("/home/luigi/tiny-scribe/chinese_summary.txt", 'w', encoding='utf-8') as f:
153
  f.write(chinese_summary)
154
+
155
  with open("/home/luigi/tiny-scribe/english_summary.txt", 'w', encoding='utf-8') as f:
156
  f.write(english_summary)
157
+
158
+ print("\nSummaries saved to chinese_summary.txt and english_summary.txt.")
159
+
160
  # Clean up
161
  del llm
162
 
163
  if __name__ == "__main__":
164
+ main()