Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- README.md +91 -8
- app.py +89 -0
- packages.txt +1 -0
- requirements.txt +11 -0
README.md
CHANGED
|
@@ -1,12 +1,95 @@
|
|
|
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji: π
|
| 4 |
-
colorFrom: green
|
| 5 |
-
colorTo: red
|
| 6 |
sdk: gradio
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
---
|
| 3 |
+
title: Scripttt
|
|
|
|
|
|
|
|
|
|
| 4 |
sdk: gradio
|
| 5 |
+
app_file: app.py
|
| 6 |
+
colorFrom: blue
|
| 7 |
+
colorTo: green
|
| 8 |
+
license: mit
|
| 9 |
+
tags:
|
| 10 |
+
- transcription
|
| 11 |
+
- diarization
|
| 12 |
+
- whisper
|
| 13 |
+
- pyannote
|
| 14 |
+
- video
|
| 15 |
+
- short-form
|
| 16 |
+
- gradio
|
| 17 |
+
- content-creation
|
| 18 |
+
python_version: "3.10"
|
| 19 |
---
|
| 20 |
|
| 21 |
+
# Scripttt
|
| 22 |
+
|
| 23 |
+
Scripttt is a Python web application that enables content creators to repurpose long-form video content into concise, engaging scripts for short-form platforms such as Instagram Reels and YouTube Shorts. Built with Gradio, Scripttt combines state-of-the-art transcription, speaker diarization, and script generation to deliver production-ready outputs that reflect the tone and style of the original conversation.
|
| 24 |
+
|
| 25 |
+
## Features
|
| 26 |
+
|
| 27 |
+
- **Video File Uploads Only**
|
| 28 |
+
Accepts direct uploads of video files (`.mp4`, `.mkv`, and other common formats). Audio-only files and external links are not supported.
|
| 29 |
+
|
| 30 |
+
- **Accurate Transcription**
|
| 31 |
+
Utilizes OpenAI Whisper for high-quality speech-to-text conversion.
|
| 32 |
+
|
| 33 |
+
- **Speaker Diarization**
|
| 34 |
+
Employs Picovoice Falcon (`pvfalcon`) to automatically identify and label speakers within the transcript.
|
| 35 |
+
|
| 36 |
+
- **Speaker-Tagged Transcript**
|
| 37 |
+
Generates a clean, speaker-attributed transcript of the input video.
|
| 38 |
+
|
| 39 |
+
- **Short-Form Script Generation**
|
| 40 |
+
Produces a concise, human-like script optimized for viral, short-form video content.
|
| 41 |
+
|
| 42 |
+
- **Privacy by Design**
|
| 43 |
+
All processing occurs locally; no external URLs or remote media are accepted.
|
| 44 |
+
|
| 45 |
+
## Installation
|
| 46 |
+
|
| 47 |
+
1. **Clone the Repository**
|
| 48 |
+
```
|
| 49 |
+
git clone https://github.com/your-username/scripttt.git
|
| 50 |
+
cd scripttt
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
2. **Set Up a Virtual Environment (Recommended)**
|
| 54 |
+
```
|
| 55 |
+
python -m venv venv
|
| 56 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
3. **Install Dependencies**
|
| 60 |
+
```
|
| 61 |
+
pip install -r requirements.txt
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
4. **Configure Environment Variables**
|
| 65 |
+
- Create a `.env` file in the project root.
|
| 66 |
+
- Add your Picovoice Falcon access key as an environment variable.
|
| 67 |
+
Example:
|
| 68 |
+
```
|
| 69 |
+
FALCON_ACCESS_KEY=your_falcon_access_key
|
| 70 |
+
# No other credentials are read by app.py.
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
## Usage
|
| 74 |
+
|
| 75 |
+
1. **Run the Application**
|
| 76 |
+
```
|
| 77 |
+
python app.py
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
2. **Access the Interface**
|
| 81 |
+
- Open the local URL provided by Gradio in your browser.
|
| 82 |
+
- Upload a supported video file and follow the on-screen instructions.
|
| 83 |
+
|
| 84 |
+
## Output
|
| 85 |
+
|
| 86 |
+
- **Speaker-Tagged Transcript:**
|
| 87 |
+
A clean, readable transcript with speaker labels.
|
| 88 |
+
|
| 89 |
+
- **Short-Form Script:**
|
| 90 |
+
A new, concise script based on the original video, ready for use in short-form content production.
|
| 91 |
+
|
| 92 |
+
## Limitations
|
| 93 |
+
|
| 94 |
+
- YouTube links, remote URLs, and audio-only files are **not supported**. Only direct video file uploads are accepted.
|
| 95 |
+
```
|
app.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# app.py
#
# Module setup: load credentials from the environment and construct the
# two models (Whisper for speech-to-text, Picovoice Falcon for speaker
# diarization) once at import time so every request reuses them.
import os
import subprocess
import tempfile

import gradio as gr
import pvfalcon
import whisper
from dotenv import load_dotenv

# ───────────────────────────────────────────
# 1. ENVIRONMENT
# ───────────────────────────────────────────
load_dotenv()
FALCON_ACCESS_KEY = os.getenv("FALCON_ACCESS_KEY")
if not FALCON_ACCESS_KEY:
    raise RuntimeError(
        "Set FALCON_ACCESS_KEY in your environment or .env file "
        "(get one free at https://console.picovoice.ai)."
    )

# ───────────────────────────────────────────
# 2. MODELS
# ───────────────────────────────────────────
# "base" keeps memory and latency low enough for a CPU-only deployment.
whisper_model = whisper.load_model("base")
falcon = pvfalcon.create(access_key=FALCON_ACCESS_KEY)

# ───────────────────────────────────────────
# 3. CORE LOGIC
# ───────────────────────────────────────────
def process_video(file, language="Auto"):
    """Transcribe an uploaded video and attribute each line to a speaker.

    Parameters
    ----------
    file : str | object
        Path to the uploaded video. ``gr.File(type="filepath")`` passes a
        plain string; older Gradio versions pass a tempfile-like wrapper
        exposing ``.name`` — both are accepted.
    language : str
        "Auto" lets Whisper detect the language; any other dropdown value
        is lower-cased and passed to Whisper as the language code.

    Returns
    -------
    tuple[str, str]
        ``(speaker_transcript, paragraph_transcript)``. On audio
        extraction failure: ``("Audio extraction failed.", "")``.
    """
    # 3.1 Choose language for Whisper ("Auto" -> None lets Whisper detect).
    lang_code = None if language == "Auto" else language.lower()

    # BUG FIX: with gr.File(type="filepath") Gradio hands us a str, which
    # has no ``.name`` attribute — the original ``file.name`` raised
    # AttributeError. Accept both a plain path and a file-like wrapper.
    video_path = file if isinstance(file, str) else file.name

    # 3.2 Extract mono 16-kHz PCM WAV with ffmpeg. The file is created
    # closed (and outside the ``with``) so ffmpeg can write to it even on
    # platforms where an open NamedTemporaryFile cannot be reopened.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav:
        wav_path = wav.name
    try:
        proc = subprocess.run(
            ["ffmpeg", "-y", "-i", video_path,
             "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", wav_path],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
        )
        # ROBUSTNESS: the original only checked file size; a non-zero
        # ffmpeg exit code also means there is no usable audio.
        if proc.returncode != 0 or not os.path.getsize(wav_path):
            return "Audio extraction failed.", ""

        # 3.3 Speaker diarization: map each Falcon segment's opaque
        # speaker tag to a stable human-readable label in first-seen order.
        segments = falcon.process_file(wav_path)  # list[pvfalcon.Segment]
        diarized_map, label_map, counter = [], {}, 1
        for seg in segments:
            tag = seg.speaker_tag
            if tag not in label_map:
                label_map[tag] = f"Speaker {counter}"
                counter += 1
            diarized_map.append(
                dict(start=seg.start_sec, end=seg.end_sec,
                     speaker=label_map[tag])
            )

        # 3.4 Transcription (Whisper).
        res = whisper_model.transcribe(wav_path, language=lang_code)
        paragraph_transcript = res["text"]  # plain paragraph

        # 3.5 Merge speakers with transcription: attribute each Whisper
        # segment to the first diarized span containing its start time.
        speaker_lines = []
        for s in res.get("segments", []):
            speaker = next(
                (m["speaker"] for m in diarized_map
                 if m["start"] <= s["start"] <= m["end"]),
                "Unknown",
            )
            speaker_lines.append(f"{speaker}: {s['text']}")
        speaker_transcript = "\n".join(speaker_lines)

        # 3.6 Return in desired order.
        return speaker_transcript, paragraph_transcript
    finally:
        # LEAK FIX: the original never removed the delete=False temp WAV,
        # leaking one file per request.
        try:
            os.remove(wav_path)
        except OSError:
            pass
# ───────────────────────────────────────────
# 4. GRADIO UI
# ───────────────────────────────────────────
demo = gr.Interface(
    fn=process_video,
    inputs=[
        # type="filepath" hands process_video a plain string path.
        gr.File(label="Upload Video", type="filepath"),
        gr.Dropdown(["Auto", "English", "Hindi", "Urdu"], label="Language"),
    ],
    outputs=[
        gr.Textbox(label="Speaker-wise Transcript", show_copy_button=True),
        # FIX: removed the stray leading space in the visible label.
        gr.Textbox(label="Transcription", show_copy_button=True),
    ],
    title="Transcription + Speaker Segmentation",
    description="Whisper + Picovoice Falcon running fully on CPU.",
)

if __name__ == "__main__":
    demo.launch()
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ffmpeg
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python-dotenv
|
| 2 |
+
requests
|
| 3 |
+
openai
|
| 4 |
+
pandas
|
| 5 |
+
git+https://github.com/openai/whisper.git
|
| 6 |
+
ffmpeg-python
|
| 7 |
+
yt-dlp
|
| 8 |
+
torch
|
| 9 |
+
torchaudio
|
| 10 |
+
gradio
|
| 11 |
+
pvfalcon # Picovoice Falcon
|