Spaces:
Sleeping
Sleeping
Commit ·
0890748
1
Parent(s): d392f23
fix english caption
Browse files- core/analyze.py +136 -38
- core/config.py +187 -223
- core/free_translator.py +8 -16
- core/stt.py +31 -78
- core/subtitle_manager.py +183 -149
- processor.py +104 -56
- requirements.txt +2 -2
core/analyze.py
CHANGED
|
@@ -1,10 +1,16 @@
|
|
| 1 |
import os
|
| 2 |
import time
|
|
|
|
|
|
|
| 3 |
from openai import OpenAI
|
| 4 |
from dotenv import load_dotenv
|
| 5 |
|
| 6 |
load_dotenv()
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
# Configure OpenAI Client
|
| 9 |
api_key = os.getenv("OPENROUTER_API_KEY")
|
| 10 |
client = OpenAI(
|
|
@@ -12,50 +18,57 @@ client = OpenAI(
|
|
| 12 |
api_key=api_key
|
| 13 |
)
|
| 14 |
|
| 15 |
-
def analyze_transcript_gemini(transcript):
|
| 16 |
-
"""Analyze transcript using OpenRouter (DeepSeek) via Env Key."""
|
| 17 |
-
|
| 18 |
-
prompt = f"""
|
| 19 |
-
You are an expert video editor and viral content strategist. Your task is to identify the most engaging segments from the provided transcript that are suitable for short-form video platforms like TikTok, Reels, and YouTube Shorts.
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
{{
|
| 26 |
"segments": [
|
| 27 |
{{
|
| 28 |
"start_time": <float, start time in seconds>,
|
| 29 |
"end_time": <float, end time in seconds>,
|
| 30 |
-
"duration": <float, duration in seconds>,
|
| 31 |
"description": "<string, brief summary of the clip content 10 words max>",
|
| 32 |
"viral_score": <float, score from 0-10 indicating viral potential>,
|
| 33 |
"reason": "<string, explanation of why this segment is engaging>"
|
| 34 |
}}
|
| 35 |
]
|
| 36 |
}}
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
IMPORTANT:
|
| 45 |
-
- Return valid JSON only.
|
| 46 |
-
- If no suitable segments are found, return {{ "segments": [] }}.
|
| 47 |
-
- Ensure all strings are properly escaped.
|
| 48 |
-
|
| 49 |
Transcript to Analyze:
|
| 50 |
{transcript}
|
| 51 |
"""
|
| 52 |
|
| 53 |
max_retries = 3
|
| 54 |
base_delay = 5
|
|
|
|
| 55 |
|
| 56 |
for attempt in range(max_retries):
|
| 57 |
try:
|
| 58 |
-
# Simple direct request
|
| 59 |
response = client.chat.completions.create(
|
| 60 |
model="deepseek/deepseek-chat",
|
| 61 |
messages=[
|
|
@@ -68,7 +81,7 @@ def analyze_transcript_gemini(transcript):
|
|
| 68 |
},
|
| 69 |
temperature=0.7,
|
| 70 |
)
|
| 71 |
-
|
| 72 |
content = response.choices[0].message.content
|
| 73 |
print(f"🤖 AI Raw Response (First 500 chars): {content[:500]}...")
|
| 74 |
|
|
@@ -77,18 +90,14 @@ def analyze_transcript_gemini(transcript):
|
|
| 77 |
content = content.split("```json")[1].split("```")[0].strip()
|
| 78 |
elif "```" in content:
|
| 79 |
content = content.split("```")[1].split("```")[0].strip()
|
| 80 |
-
|
| 81 |
-
#
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
segments_count = len(data.get("segments", []))
|
| 86 |
-
print(f"🤖 AI Response parsed successfully: Found {segments_count} segments.")
|
| 87 |
-
except Exception as e:
|
| 88 |
-
print(f"⚠️ Failed to parse AI response for logging: {e}")
|
| 89 |
|
| 90 |
return {"content": content}
|
| 91 |
-
|
| 92 |
except Exception as e:
|
| 93 |
print(f"❌ Error in OpenRouter analysis: {e}")
|
| 94 |
if attempt < max_retries - 1:
|
|
@@ -102,10 +111,99 @@ def analyze_transcript_gemini(transcript):
|
|
| 102 |
return {"content": '{"segments": []}'}
|
| 103 |
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
-
|
|
|
|
| 107 |
if __name__ == "__main__":
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import time
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
from openai import OpenAI
|
| 6 |
from dotenv import load_dotenv
|
| 7 |
|
| 8 |
load_dotenv()
|
| 9 |
|
| 10 |
+
# Setup Logger
|
| 11 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
# Configure OpenAI Client
|
| 15 |
api_key = os.getenv("OPENROUTER_API_KEY")
|
| 16 |
client = OpenAI(
|
|
|
|
| 18 |
api_key=api_key
|
| 19 |
)
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
def analyze_transcript(transcript):
|
| 23 |
+
"""Analyze transcript using OpenRouter via Env Key."""
|
| 24 |
+
|
| 25 |
+
prompt = f"""
|
| 26 |
+
You are an expert video editor and viral content strategist.
|
| 27 |
+
Your task is to identify the most engaging segments from the provided transcript
|
| 28 |
+
that are suitable for short-form video platforms like TikTok, Reels, and YouTube Shorts.
|
| 29 |
|
| 30 |
+
**STRICT REQUIREMENTS:**
|
| 31 |
+
1. **Duration**: duration MUST be between 60 seconds and 180 seconds (3 minutes)
|
| 32 |
+
2. **Context Preservation**: Each segment must be a complete thought - no abrupt cuts
|
| 33 |
+
3. **Sentence Boundaries**: Start at the beginning of a sentence, end at a natural conclusion
|
| 34 |
+
4. **Meaning Coherence**: The clip must make sense on its own without requiring prior context
|
| 35 |
+
|
| 36 |
+
**SELECTION CRITERIA:**
|
| 37 |
+
- Strong hooks that grab attention
|
| 38 |
+
- Emotional moments, humor, or surprising revelations
|
| 39 |
+
- Clear beginning, middle, and satisfying conclusion
|
| 40 |
+
- High shareability potential
|
| 41 |
+
|
| 42 |
+
**JSON OUTPUT FORMAT (REQUIRED):**
|
| 43 |
{{
|
| 44 |
"segments": [
|
| 45 |
{{
|
| 46 |
"start_time": <float, start time in seconds>,
|
| 47 |
"end_time": <float, end time in seconds>,
|
| 48 |
+
"duration": <float, duration in seconds (30-180)>,
|
| 49 |
"description": "<string, brief summary of the clip content 10 words max>",
|
| 50 |
"viral_score": <float, score from 0-10 indicating viral potential>,
|
| 51 |
"reason": "<string, explanation of why this segment is engaging>"
|
| 52 |
}}
|
| 53 |
]
|
| 54 |
}}
|
| 55 |
+
|
| 56 |
+
**IMPORTANT NOTES:**
|
| 57 |
+
- If no suitable segments are found, return {{ "segments": [] }}
|
| 58 |
+
- Ensure all strings are properly escaped
|
| 59 |
+
- Each segment must be a complete, coherent thought
|
| 60 |
+
- Avoid cutting mid-sentence or mid-thought
|
| 61 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
Transcript to Analyze:
|
| 63 |
{transcript}
|
| 64 |
"""
|
| 65 |
|
| 66 |
max_retries = 3
|
| 67 |
base_delay = 5
|
| 68 |
+
content = None # FIX: initialize content to avoid UnboundLocalError
|
| 69 |
|
| 70 |
for attempt in range(max_retries):
|
| 71 |
try:
|
|
|
|
| 72 |
response = client.chat.completions.create(
|
| 73 |
model="deepseek/deepseek-chat",
|
| 74 |
messages=[
|
|
|
|
| 81 |
},
|
| 82 |
temperature=0.7,
|
| 83 |
)
|
| 84 |
+
|
| 85 |
content = response.choices[0].message.content
|
| 86 |
print(f"🤖 AI Raw Response (First 500 chars): {content[:500]}...")
|
| 87 |
|
|
|
|
| 90 |
content = content.split("```json")[1].split("```")[0].strip()
|
| 91 |
elif "```" in content:
|
| 92 |
content = content.split("```")[1].split("```")[0].strip()
|
| 93 |
+
|
| 94 |
+
# Validate JSON and log segment count
|
| 95 |
+
data = json.loads(content)
|
| 96 |
+
segments_count = len(data.get("segments", []))
|
| 97 |
+
print(f"🤖 AI Response parsed successfully: Found {segments_count} segments.")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
return {"content": content}
|
| 100 |
+
|
| 101 |
except Exception as e:
|
| 102 |
print(f"❌ Error in OpenRouter analysis: {e}")
|
| 103 |
if attempt < max_retries - 1:
|
|
|
|
| 111 |
return {"content": '{"segments": []}'}
|
| 112 |
|
| 113 |
|
| 114 |
+
# Smart chunking system for long transcripts
|
| 115 |
+
def smart_chunk_transcript(transcript, max_tokens=4000):
    """Split a transcript into chunks that break only at sentence boundaries.

    Newlines are flattened to spaces and the text is split on ``'. '``.
    Sentences are then packed greedily into chunks whose size, measured in
    whitespace-separated words (used here as a stand-in for model tokens),
    stays within ``max_tokens``.

    Args:
        transcript: Full transcript text. ``None`` or blank text yields no
            chunks.
        max_tokens: Maximum number of words per chunk. A single sentence
            longer than this limit is kept whole in a chunk of its own.

    Returns:
        A list of chunk strings in transcript order, each ending with
        terminal punctuation.
    """
    if not transcript:
        return []

    # Drop empty fragments so blank input does not produce a lone "." chunk.
    sentences = [
        piece.strip()
        for piece in transcript.replace('\n', ' ').split('. ')
        if piece.strip()
    ]

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())

        # Close the running chunk only when it already holds something;
        # otherwise an oversized sentence would emit an empty chunk first.
        if current_chunk and current_length + sentence_length > max_tokens:
            chunks.append(_join_sentences(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(_join_sentences(current_chunk))

    return chunks


def _join_sentences(sentences):
    """Rejoin split sentences, adding a final period only if one is missing."""
    text = '. '.join(sentences)
    # The transcript's last sentence keeps its own punctuation after the
    # split, so appending a period unconditionally would produce "..".
    if text.endswith(('.', '!', '?')):
        return text
    return text + '.'
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def analyze_transcript_with_chunking(transcript):
    """Analyze a transcript, splitting long ones into chunks first.

    Transcripts of more than 3000 whitespace-separated words are split with
    ``smart_chunk_transcript`` and every chunk is passed to
    ``analyze_transcript`` on its own. The segments returned for all chunks
    are merged, ordered by ``viral_score`` (highest first), de-duplicated on
    their start/end times rounded to whole seconds, and capped at 10.
    Shorter transcripts are analyzed in a single call.

    Args:
        transcript: Transcript text to analyze.

    Returns:
        A dict with one ``"content"`` key whose value is a JSON string.
        On the chunked path it has the form ``{"segments": [...]}``.
    """
    if len(transcript.split()) > 3000:
        logger.info("📦 Transcript too long, using smart chunking...")
        chunks = smart_chunk_transcript(transcript, max_tokens=3000)
        all_segments = []

        for i, chunk in enumerate(chunks):
            logger.info(f"🔄 Processing chunk {i+1}/{len(chunks)}...")
            result = analyze_transcript(chunk)

            try:
                data = json.loads(result['content'])
            except (KeyError, TypeError, ValueError) as e:
                logger.warning(f"⚠️ Failed to parse chunk {i+1}: {e}")
                continue

            # The payload is model output: accept only the documented shape
            # (an object holding a list of segment objects) and ignore the rest.
            segments = data.get('segments') if isinstance(data, dict) else None
            if isinstance(segments, list):
                all_segments.extend(s for s in segments if isinstance(s, dict))

        if all_segments:
            all_segments.sort(
                key=lambda seg: _as_float(seg.get('viral_score')),
                reverse=True,
            )
            unique_segments = []
            seen_times = set()

            for seg in all_segments:
                # Coerce before formatting: a string or null timestamp would
                # otherwise raise inside the ``:.0f`` format spec.
                start = _as_float(seg.get('start_time'))
                end = _as_float(seg.get('end_time'))
                time_key = f"{start:.0f}-{end:.0f}"
                if time_key not in seen_times:
                    unique_segments.append(seg)
                    seen_times.add(time_key)

            return {"content": json.dumps({"segments": unique_segments[:10]})}

    # NOTE(review): this is also reached when a long transcript was chunked
    # but no chunk produced a segment, so the full transcript is then sent in
    # one request. Kept as-is; confirm that fallback is intended.
    return analyze_transcript(transcript)


def _as_float(value, default=0.0):
    """Return ``value`` as a float, or ``default`` when it is not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return default
|
| 182 |
|
| 183 |
+
|
| 184 |
+
# Testing
|
| 185 |
if __name__ == "__main__":
    # Manual smoke test: send a short, timestamped sample transcript through
    # the chunk-aware entry point and log every segment that comes back.
    test_transcript = """
[0.0 - 5.0] Welcome to today's video about productivity hacks that actually work.
[5.0 - 15.0] The first hack is something I call the 2-minute rule. If something takes less than 2 minutes, do it immediately.
[15.0 - 30.0] This simple rule has transformed my life. I used to procrastinate on small tasks, but now I handle them right away.
[30.0 - 45.0] The second hack is batching similar tasks together. Instead of checking email 20 times a day, I check it twice.
[45.0 - 60.0] This has saved me hours every week. I batch my emails, phone calls, and even errands.
[60.0 - 90.0] The third hack is the Pomodoro Technique. Work for 25 minutes, then take a 5-minute break.
[90.0 - 120.0] This technique helps me stay focused and avoid burnout. I get more done in less time.
"""

    logger.info("🧪 Testing AI Analysis...")
    analysis = analyze_transcript_with_chunking(test_transcript)

    try:
        found = json.loads(analysis['content']).get('segments', [])
        logger.info(f"✅ Found {len(found)} viral segments:")

        for rank, clip in enumerate(found, start=1):
            span = f"[{clip['start_time']:.0f}s-{clip['end_time']:.0f}s]"
            logger.info(
                f" #{rank} {span} "
                f"Score: {clip['viral_score']}/10 - {clip['description']}"
            )
    except Exception as e:
        # Show the raw payload so a malformed response can be inspected.
        logger.error(f"❌ Error parsing result: {e}")
        logger.info(f"Raw result: {analysis}")
|
core/config.py
CHANGED
|
@@ -13,11 +13,11 @@ Cyrillic: ru, uk (Ukrainian)
|
|
| 13 |
Hebrew: he
|
| 14 |
Thai: th
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
"""
|
| 22 |
import os
|
| 23 |
import re
|
|
@@ -35,193 +35,112 @@ class Config:
|
|
| 35 |
LOGS_DIR = os.path.join(BASE_DIR, "logs")
|
| 36 |
|
| 37 |
# ─────────────────────────────────────────────────────────────────────────
|
| 38 |
-
# Font Registry
|
| 39 |
-
# All URLs use Google Fonts CSS2 API — wght@700/800 = Bold
|
| 40 |
# ─────────────────────────────────────────────────────────────────────────
|
| 41 |
FONTS = {
|
| 42 |
|
| 43 |
-
# ── Latin / Universal
|
| 44 |
-
# ✅ Montserrat has BOTH Latin AND Cyrillic — #1 viral font
|
| 45 |
"Montserrat-Bold.ttf": "https://fonts.googleapis.com/css2?family=Montserrat:wght@700&display=swap",
|
| 46 |
-
# ✅ Rubik: modern, supports Latin + Cyrillic + Hebrew(!)
|
| 47 |
"Rubik-Bold.ttf": "https://fonts.googleapis.com/css2?family=Rubik:wght@700&display=swap",
|
| 48 |
-
# Oswald: condensed Latin only — fast speech / lots of words
|
| 49 |
"Oswald-Bold.ttf": "https://fonts.googleapis.com/css2?family=Oswald:wght@700&display=swap",
|
| 50 |
-
# Roboto: clean baseline, Latin + Cyrillic + Greek
|
| 51 |
"Roboto-Bold.ttf": "https://fonts.googleapis.com/css2?family=Roboto:wght@700&display=swap",
|
| 52 |
|
| 53 |
-
# ── Arabic Script
|
| 54 |
-
# ✅ #1 choice: Tajawal — modern social media Arabic, youth-oriented
|
| 55 |
"Tajawal-Bold.ttf": "https://fonts.googleapis.com/css2?family=Tajawal:wght@700&display=swap",
|
| 56 |
-
# Cairo: clean, highly legible — great for captions
|
| 57 |
"Cairo-Bold.ttf": "https://fonts.googleapis.com/css2?family=Cairo:wght@700&display=swap",
|
| 58 |
-
# Almarai: rounded, friendly — Gulf & Egyptian content
|
| 59 |
"Almarai-Bold.ttf": "https://fonts.googleapis.com/css2?family=Almarai:wght@800&display=swap",
|
| 60 |
-
# ✅ Noto Sans Arabic — universal fallback, covers ALL Arabic Unicode
|
| 61 |
"NotoSansArabic-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic:wght@700&display=swap",
|
| 62 |
|
| 63 |
-
# ── Persian
|
| 64 |
-
# ✅ Vazirmatn: most popular Persian font on social media 2024
|
| 65 |
"Vazirmatn-Bold.ttf": "https://fonts.googleapis.com/css2?family=Vazirmatn:wght@700&display=swap",
|
| 66 |
|
| 67 |
-
# ── Urdu
|
| 68 |
-
# Using Noto Sans Arabic as best available web fallback
|
| 69 |
-
# Note: Authentic Urdu uses Nastaliq but it's not web-standard yet
|
| 70 |
"NotoSansArabicUrdu-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic:wght@700&display=swap",
|
| 71 |
|
| 72 |
-
# ── Hebrew
|
| 73 |
-
# ✅ Rubik supports Hebrew natively (same font as Latin Rubik!)
|
| 74 |
-
# Frank Ruhl Libre: traditional Hebrew newspaper feel
|
| 75 |
"FrankRuhlLibre-Bold.ttf": "https://fonts.googleapis.com/css2?family=Frank+Ruhl+Libre:wght@700&display=swap",
|
| 76 |
-
# ✅ Heebo: modern clean Hebrew for captions
|
| 77 |
"Heebo-Bold.ttf": "https://fonts.googleapis.com/css2?family=Heebo:wght@700&display=swap",
|
| 78 |
|
| 79 |
# ── CJK ───────────────────────────────────────────────────────────────
|
| 80 |
-
# Chinese Simplified
|
| 81 |
"NotoSansSC-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+SC:wght@700&display=swap",
|
| 82 |
-
# Chinese Traditional
|
| 83 |
"NotoSansTC-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+TC:wght@700&display=swap",
|
| 84 |
-
# Japanese
|
| 85 |
"NotoSansJP-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+JP:wght@700&display=swap",
|
| 86 |
-
# ✅ Korean — Noto Sans KR
|
| 87 |
"NotoSansKR-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+KR:wght@700&display=swap",
|
| 88 |
|
| 89 |
-
# ── Devanagari
|
| 90 |
-
"NotoSansDevanagari-Bold.ttf":"https://fonts.googleapis.com/css2?family=Noto+Sans+Devanagari:wght@700&display=swap",
|
| 91 |
-
# ✅ Poppins: has Devanagari + Latin — great for bilingual Hindi content
|
| 92 |
"Poppins-Bold.ttf": "https://fonts.googleapis.com/css2?family=Poppins:wght@700&display=swap",
|
| 93 |
|
| 94 |
-
# ── Thai
|
| 95 |
-
# ✅ Sarabun: most popular Thai social media font, clean & modern
|
| 96 |
"Sarabun-Bold.ttf": "https://fonts.googleapis.com/css2?family=Sarabun:wght@700&display=swap",
|
| 97 |
-
# Noto Sans Thai: reliable fallback
|
| 98 |
"NotoSansThai-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+Thai:wght@700&display=swap",
|
| 99 |
|
| 100 |
-
# ──
|
| 101 |
-
# Montserrat covers Ukrainian Cyrillic, but for dedicated support:
|
| 102 |
"NotoSans-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans:wght@700&display=swap",
|
| 103 |
}
|
| 104 |
|
| 105 |
# ─────────────────────────────────────────────────────────────────────────
|
| 106 |
-
# Language →
|
| 107 |
-
#
|
| 108 |
-
# Priority: most viral / readable on mobile screens
|
| 109 |
-
# Rule: non-Latin scripts ALWAYS override style font
|
| 110 |
# ─────────────────────────────────────────────────────────────────────────
|
| 111 |
LANGUAGE_FONT_MAP = {
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
"
|
| 116 |
-
"
|
| 117 |
-
"
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
"
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
"
|
| 124 |
-
"
|
| 125 |
-
"
|
| 126 |
-
"
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
"
|
| 130 |
-
"
|
| 131 |
-
"
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
"
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
"
|
| 140 |
-
"uk": "Montserrat-Bold.ttf", # Ukrainian (Cyrillic) — was missing
|
| 141 |
-
|
| 142 |
-
# ── Latin Languages ────────────────────────────────────────────────────
|
| 143 |
-
"en": "Montserrat-Bold.ttf", # English
|
| 144 |
-
"fr": "Montserrat-Bold.ttf", # French
|
| 145 |
-
"es": "Montserrat-Bold.ttf", # Spanish
|
| 146 |
-
"de": "Montserrat-Bold.ttf", # German
|
| 147 |
-
"pt": "Montserrat-Bold.ttf", # Portuguese (Brazil + Portugal)
|
| 148 |
-
"it": "Montserrat-Bold.ttf", # Italian
|
| 149 |
-
"tr": "Montserrat-Bold.ttf", # Turkish (Latin script since 1928)
|
| 150 |
-
"nl": "Montserrat-Bold.ttf", # ✅ Dutch (was missing)
|
| 151 |
-
"pl": "Montserrat-Bold.ttf", # ✅ Polish (was missing)
|
| 152 |
-
"id": "Montserrat-Bold.ttf", # ✅ Indonesian (was missing)
|
| 153 |
-
"vi": "Roboto-Bold.ttf", # ✅ Vietnamese — Roboto has better
|
| 154 |
-
# diacritic coverage (tones)
|
| 155 |
-
"sv": "Montserrat-Bold.ttf", # ✅ Swedish (was missing)
|
| 156 |
-
"ro": "Montserrat-Bold.ttf", # ✅ Romanian (was missing)
|
| 157 |
-
|
| 158 |
-
# ── Fallback ───────────────────────────────────────────────────────────
|
| 159 |
-
# Noto Sans: designed to cover ALL Unicode — zero missing glyphs
|
| 160 |
-
# Better than Montserrat for unknown scripts
|
| 161 |
-
"default": "NotoSans-Bold.ttf", # ⬆️ Upgraded from Montserrat
|
| 162 |
}
|
| 163 |
|
| 164 |
-
# ─────────────────────────────────────────────────────────────────────────
|
| 165 |
-
# Caption Style → Preferred Font (Latin-only styles)
|
| 166 |
-
#
|
| 167 |
-
# IMPORTANT: Non-Latin scripts ALWAYS use LANGUAGE_FONT_MAP regardless
|
| 168 |
-
# of style. This map only applies when language is Latin/Cyrillic.
|
| 169 |
-
# ─────────────────────────────────────────────────────────────────────────
|
| 170 |
STYLE_FONT_MAP = {
|
| 171 |
-
# Montserrat: #1 viral font — Alex Hormozi, MrBeast, Sidemen
|
| 172 |
"classic": "Montserrat-Bold.ttf",
|
| 173 |
-
|
| 174 |
-
# Rubik: distinctive modern feel — supports Latin + Cyrillic + Hebrew
|
| 175 |
-
# ✅ Better than original for multilingual content
|
| 176 |
"modern_glow": "Rubik-Bold.ttf",
|
| 177 |
-
|
| 178 |
-
# Montserrat: proven viral MrBeast aesthetic
|
| 179 |
"tiktok_bold": "Montserrat-Bold.ttf",
|
| 180 |
-
|
| 181 |
-
# ✅ Changed: Oswald has NO Arabic/CJK support
|
| 182 |
-
# Using Montserrat which handles more scripts gracefully
|
| 183 |
-
# For pure Latin content, Oswald (condensed) is still good
|
| 184 |
-
"tiktok_neon": "Montserrat-Bold.ttf", # was Oswald-Bold (no Arabic!)
|
| 185 |
-
|
| 186 |
-
# Rubik: clean educator look + multilingual
|
| 187 |
"youtube_clean": "Rubik-Bold.ttf",
|
| 188 |
-
|
| 189 |
-
# Montserrat: karaoke / game-show energy
|
| 190 |
"youtube_box": "Montserrat-Bold.ttf",
|
| 191 |
}
|
| 192 |
|
| 193 |
-
# ─────────────────────────────────────────────────────────────────────────
|
| 194 |
-
# Unicode Range → Language Detection
|
| 195 |
-
# Used in ensure_font() for script auto-detection
|
| 196 |
-
# ─────────────────────────────────────────────────────────────────────────
|
| 197 |
UNICODE_SCRIPT_RANGES = [
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
("\
|
| 201 |
-
("\
|
| 202 |
-
("\
|
| 203 |
-
("\
|
| 204 |
-
("\
|
| 205 |
-
("\
|
| 206 |
-
("\
|
| 207 |
-
("\
|
| 208 |
-
("\
|
| 209 |
-
("\
|
| 210 |
-
("\
|
| 211 |
-
("\
|
| 212 |
-
("\u0400", "\u04FF", "ru"), # Cyrillic
|
| 213 |
-
("\u0500", "\u052F", "ru"), # Cyrillic Supplement
|
| 214 |
]
|
| 215 |
|
| 216 |
-
# ─────────────────────────────────────────────────────────────────────────
|
| 217 |
-
# RTL Languages (Right-to-Left)
|
| 218 |
-
# Used for text rendering direction
|
| 219 |
-
# ─────────────────────────────────────────────────────────────────────────
|
| 220 |
RTL_LANGUAGES = {"ar", "fa", "ur", "he"}
|
| 221 |
|
| 222 |
-
# ─────────────────────────────────────────────────────────────────────────
|
| 223 |
-
# Video settings
|
| 224 |
-
# ─────────────────────────────────────────────────────────────────────────
|
| 225 |
DEFAULT_SIZE = (1080, 1920)
|
| 226 |
CHUNK_SIZE_SECONDS = 600
|
| 227 |
OVERLAP_SECONDS = 60
|
|
@@ -243,61 +162,34 @@ class Config:
|
|
| 243 |
os.makedirs(d, exist_ok=True)
|
| 244 |
|
| 245 |
# ─────────────────────────────────────────────────────────────────────────
|
| 246 |
-
# Language detection
|
| 247 |
# ─────────────────────────────────────────────────────────────────────────
|
| 248 |
@classmethod
|
| 249 |
def detect_language_from_text(cls, text: str) -> str | None:
|
| 250 |
-
"""
|
| 251 |
-
Detects script/language from Unicode character ranges.
|
| 252 |
-
Returns language code or None if only Latin/ASCII detected.
|
| 253 |
-
|
| 254 |
-
More reliable than the original inline checks in ensure_font()
|
| 255 |
-
because it covers Korean, Thai, Hebrew, Persian, and more.
|
| 256 |
-
"""
|
| 257 |
if not text:
|
| 258 |
return None
|
| 259 |
-
|
| 260 |
for start, end, lang in cls.UNICODE_SCRIPT_RANGES:
|
| 261 |
if any(start <= c <= end for c in text):
|
| 262 |
return lang
|
| 263 |
-
|
| 264 |
-
return None # Latin / unknown
|
| 265 |
|
| 266 |
@classmethod
|
| 267 |
def is_rtl(cls, language: str) -> bool:
|
| 268 |
-
"""Returns True if language is right-to-left."""
|
| 269 |
return language in cls.RTL_LANGUAGES
|
| 270 |
|
| 271 |
@classmethod
|
| 272 |
def get_font_for_language(cls, language: str, style_name: str = None) -> str:
|
| 273 |
-
"""
|
| 274 |
-
Returns the best font filename for a given language + style combination.
|
| 275 |
-
|
| 276 |
-
Priority:
|
| 277 |
-
1. Non-Latin scripts → always use LANGUAGE_FONT_MAP (ignores style)
|
| 278 |
-
2. Latin with explicit style → use STYLE_FONT_MAP
|
| 279 |
-
3. Latin with known language → use LANGUAGE_FONT_MAP
|
| 280 |
-
4. Unknown → use LANGUAGE_FONT_MAP default
|
| 281 |
-
"""
|
| 282 |
NON_LATIN = {
|
| 283 |
"ar", "fa", "ur", "he",
|
| 284 |
"zh", "zh-tw", "ja", "ko",
|
| 285 |
-
"hi", "mr", "ne",
|
| 286 |
-
"th",
|
| 287 |
}
|
| 288 |
-
|
| 289 |
-
# Non-Latin: always use language map regardless of style
|
| 290 |
if language in NON_LATIN:
|
| 291 |
return cls.LANGUAGE_FONT_MAP.get(language, cls.LANGUAGE_FONT_MAP["default"])
|
| 292 |
-
|
| 293 |
-
# Latin/Cyrillic with style preference
|
| 294 |
if style_name and style_name in cls.STYLE_FONT_MAP:
|
| 295 |
return cls.STYLE_FONT_MAP[style_name]
|
| 296 |
-
|
| 297 |
-
# Latin with known language
|
| 298 |
if language in cls.LANGUAGE_FONT_MAP:
|
| 299 |
return cls.LANGUAGE_FONT_MAP[language]
|
| 300 |
-
|
| 301 |
return cls.LANGUAGE_FONT_MAP["default"]
|
| 302 |
|
| 303 |
# ─────────────────────────────────────────────────────────────────────────
|
|
@@ -307,35 +199,65 @@ class Config:
|
|
| 307 |
def get_urls(css_content: str, prefer_latin: bool = True) -> list:
|
| 308 |
"""
|
| 309 |
Extracts font file URLs from a Google Fonts CSS response.
|
| 310 |
-
Prefers
|
| 311 |
"""
|
|
|
|
| 312 |
pattern = re.compile(
|
| 313 |
r'/\*\s*\[?\d*\]?\s*([\w\-]+)\s*\*/[^}]*?url\(([^)]+)\)',
|
| 314 |
re.DOTALL,
|
| 315 |
)
|
| 316 |
pairs = pattern.findall(css_content)
|
| 317 |
|
| 318 |
-
if
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
|
|
|
|
|
|
|
|
|
|
| 333 |
@staticmethod
|
| 334 |
def download_font_from_css(css_url: str, output_path: str) -> bool:
|
| 335 |
"""
|
| 336 |
-
Downloads the correct font file for a given CSS URL.
|
| 337 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
NON_LATIN_KEYWORDS = (
|
| 340 |
"arabic", "noto", "devanagari", "sc", "jp", "kr", "tc",
|
| 341 |
"thai", "sarabun", "heebo", "frank", "vazir", "tajawal",
|
|
@@ -345,39 +267,81 @@ class Config:
|
|
| 345 |
is_non_latin = any(kw in filename for kw in NON_LATIN_KEYWORDS)
|
| 346 |
prefer_latin = not is_non_latin
|
| 347 |
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
Hebrew: he
|
| 14 |
Thai: th
|
| 15 |
|
| 16 |
+
FONT DOWNLOAD FIX:
|
| 17 |
+
Google Fonts returns woff2 for modern browsers — Pillow cannot load woff2.
|
| 18 |
+
Solution: use an old IE User-Agent to force Google Fonts to return TTF URLs.
|
| 19 |
+
Modern UA → fonts.gstatic.com/s/cairo/xxx.woff2 ← Pillow FAILS
|
| 20 |
+
Old IE UA → fonts.gstatic.com/s/cairo/xxx.ttf ← Pillow works ✅
|
| 21 |
"""
|
| 22 |
import os
|
| 23 |
import re
|
|
|
|
| 35 |
LOGS_DIR = os.path.join(BASE_DIR, "logs")
|
| 36 |
|
| 37 |
# ─────────────────────────────────────────────────────────────────────────
|
| 38 |
+
# Font Registry — Google Fonts CSS2 API URLs
|
|
|
|
| 39 |
# ─────────────────────────────────────────────────────────────────────────
|
| 40 |
FONTS = {
|
| 41 |
|
| 42 |
+
# ── Latin / Universal ──────────────────────────────────────────────────
|
|
|
|
| 43 |
"Montserrat-Bold.ttf": "https://fonts.googleapis.com/css2?family=Montserrat:wght@700&display=swap",
|
|
|
|
| 44 |
"Rubik-Bold.ttf": "https://fonts.googleapis.com/css2?family=Rubik:wght@700&display=swap",
|
|
|
|
| 45 |
"Oswald-Bold.ttf": "https://fonts.googleapis.com/css2?family=Oswald:wght@700&display=swap",
|
|
|
|
| 46 |
"Roboto-Bold.ttf": "https://fonts.googleapis.com/css2?family=Roboto:wght@700&display=swap",
|
| 47 |
|
| 48 |
+
# ── Arabic Script ──────────────────────────────────────────────────────
|
|
|
|
| 49 |
"Tajawal-Bold.ttf": "https://fonts.googleapis.com/css2?family=Tajawal:wght@700&display=swap",
|
|
|
|
| 50 |
"Cairo-Bold.ttf": "https://fonts.googleapis.com/css2?family=Cairo:wght@700&display=swap",
|
|
|
|
| 51 |
"Almarai-Bold.ttf": "https://fonts.googleapis.com/css2?family=Almarai:wght@800&display=swap",
|
|
|
|
| 52 |
"NotoSansArabic-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic:wght@700&display=swap",
|
| 53 |
|
| 54 |
+
# ── Persian ────────────────────────────────────────────────────────────
|
|
|
|
| 55 |
"Vazirmatn-Bold.ttf": "https://fonts.googleapis.com/css2?family=Vazirmatn:wght@700&display=swap",
|
| 56 |
|
| 57 |
+
# ── Urdu ───────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
| 58 |
"NotoSansArabicUrdu-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic:wght@700&display=swap",
|
| 59 |
|
| 60 |
+
# ── Hebrew ─────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
| 61 |
"FrankRuhlLibre-Bold.ttf": "https://fonts.googleapis.com/css2?family=Frank+Ruhl+Libre:wght@700&display=swap",
|
|
|
|
| 62 |
"Heebo-Bold.ttf": "https://fonts.googleapis.com/css2?family=Heebo:wght@700&display=swap",
|
| 63 |
|
| 64 |
# ── CJK ───────────────────────────────────────────────────────────────
|
|
|
|
| 65 |
"NotoSansSC-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+SC:wght@700&display=swap",
|
|
|
|
| 66 |
"NotoSansTC-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+TC:wght@700&display=swap",
|
|
|
|
| 67 |
"NotoSansJP-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+JP:wght@700&display=swap",
|
|
|
|
| 68 |
"NotoSansKR-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+KR:wght@700&display=swap",
|
| 69 |
|
| 70 |
+
# ── Devanagari ────────────────────────────────────────────────────────
|
| 71 |
+
"NotoSansDevanagari-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+Devanagari:wght@700&display=swap",
|
|
|
|
| 72 |
"Poppins-Bold.ttf": "https://fonts.googleapis.com/css2?family=Poppins:wght@700&display=swap",
|
| 73 |
|
| 74 |
+
# ── Thai ──────────────────────────────────────────────────────────────
|
|
|
|
| 75 |
"Sarabun-Bold.ttf": "https://fonts.googleapis.com/css2?family=Sarabun:wght@700&display=swap",
|
|
|
|
| 76 |
"NotoSansThai-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans+Thai:wght@700&display=swap",
|
| 77 |
|
| 78 |
+
# ── Universal fallback ─────────────────────────────────────────────────
|
|
|
|
| 79 |
"NotoSans-Bold.ttf": "https://fonts.googleapis.com/css2?family=Noto+Sans:wght@700&display=swap",
|
| 80 |
}
|
| 81 |
|
| 82 |
# ─────────────────────────────────────────────────────────────────────────
|
| 83 |
+
# Language → Font
|
|
|
|
|
|
|
|
|
|
| 84 |
# ─────────────────────────────────────────────────────────────────────────
|
| 85 |
LANGUAGE_FONT_MAP = {
|
| 86 |
+
"ar": "Tajawal-Bold.ttf",
|
| 87 |
+
"fa": "Vazirmatn-Bold.ttf",
|
| 88 |
+
"ur": "NotoSansArabic-Bold.ttf",
|
| 89 |
+
"he": "Heebo-Bold.ttf",
|
| 90 |
+
"zh": "NotoSansSC-Bold.ttf",
|
| 91 |
+
"zh-tw": "NotoSansTC-Bold.ttf",
|
| 92 |
+
"ja": "NotoSansJP-Bold.ttf",
|
| 93 |
+
"ko": "NotoSansKR-Bold.ttf",
|
| 94 |
+
"hi": "NotoSansDevanagari-Bold.ttf",
|
| 95 |
+
"mr": "NotoSansDevanagari-Bold.ttf",
|
| 96 |
+
"ne": "NotoSansDevanagari-Bold.ttf",
|
| 97 |
+
"th": "Sarabun-Bold.ttf",
|
| 98 |
+
"ru": "Montserrat-Bold.ttf",
|
| 99 |
+
"uk": "Montserrat-Bold.ttf",
|
| 100 |
+
"en": "Montserrat-Bold.ttf",
|
| 101 |
+
"fr": "Montserrat-Bold.ttf",
|
| 102 |
+
"es": "Montserrat-Bold.ttf",
|
| 103 |
+
"de": "Montserrat-Bold.ttf",
|
| 104 |
+
"pt": "Montserrat-Bold.ttf",
|
| 105 |
+
"it": "Montserrat-Bold.ttf",
|
| 106 |
+
"tr": "Montserrat-Bold.ttf",
|
| 107 |
+
"nl": "Montserrat-Bold.ttf",
|
| 108 |
+
"pl": "Montserrat-Bold.ttf",
|
| 109 |
+
"id": "Montserrat-Bold.ttf",
|
| 110 |
+
"vi": "Roboto-Bold.ttf",
|
| 111 |
+
"sv": "Montserrat-Bold.ttf",
|
| 112 |
+
"ro": "Montserrat-Bold.ttf",
|
| 113 |
+
"default": "NotoSans-Bold.ttf",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
}
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
STYLE_FONT_MAP = {
|
|
|
|
| 117 |
"classic": "Montserrat-Bold.ttf",
|
|
|
|
|
|
|
|
|
|
| 118 |
"modern_glow": "Rubik-Bold.ttf",
|
|
|
|
|
|
|
| 119 |
"tiktok_bold": "Montserrat-Bold.ttf",
|
| 120 |
+
"tiktok_neon": "Montserrat-Bold.ttf",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
"youtube_clean": "Rubik-Bold.ttf",
|
|
|
|
|
|
|
| 122 |
"youtube_box": "Montserrat-Bold.ttf",
|
| 123 |
}
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
# Ordered (first_char, last_char, language_code) Unicode ranges used for
# script detection. detect_language_from_text returns the language of the
# FIRST entry that any character of the text falls into, so position in this
# list is priority. Kana must come before the CJK ideograph ranges: Japanese
# text normally mixes kana with kanji, and kanji share the ideograph blocks
# with Chinese, so testing "zh" first would label Japanese text as Chinese.
UNICODE_SCRIPT_RANGES = [
    ("\u0600", "\u06FF", "ar"),  # Arabic
    ("\u0750", "\u077F", "ar"),  # Arabic Supplement
    ("\u08A0", "\u08FF", "ar"),  # Arabic Extended-A
    ("\u0590", "\u05FF", "he"),  # Hebrew
    ("\uAC00", "\uD7AF", "ko"),  # Hangul Syllables
    ("\u1100", "\u11FF", "ko"),  # Hangul Jamo
    ("\u3040", "\u309F", "ja"),  # Hiragana
    ("\u30A0", "\u30FF", "ja"),  # Katakana
    ("\u4E00", "\u9FFF", "zh"),  # CJK Unified Ideographs
    ("\u3400", "\u4DBF", "zh"),  # CJK Unified Ideographs Extension A
    ("\u0900", "\u097F", "hi"),  # Devanagari
    ("\u0E00", "\u0E7F", "th"),  # Thai
    ("\u0400", "\u04FF", "ru"),  # Cyrillic
    ("\u0500", "\u052F", "ru"),  # Cyrillic Supplement
]
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
RTL_LANGUAGES = {"ar", "fa", "ur", "he"}
|
| 143 |
|
|
|
|
|
|
|
|
|
|
| 144 |
DEFAULT_SIZE = (1080, 1920)
|
| 145 |
CHUNK_SIZE_SECONDS = 600
|
| 146 |
OVERLAP_SECONDS = 60
|
|
|
|
| 162 |
os.makedirs(d, exist_ok=True)
|
| 163 |
|
| 164 |
# ─────────────────────────────────────────────────────────────────────────
|
| 165 |
+
# Language detection
|
| 166 |
# ─────────────────────────────────────────────────────────────────────────
|
| 167 |
@classmethod
|
| 168 |
def detect_language_from_text(cls, text: str) -> str | None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
if not text:
|
| 170 |
return None
|
|
|
|
| 171 |
for start, end, lang in cls.UNICODE_SCRIPT_RANGES:
|
| 172 |
if any(start <= c <= end for c in text):
|
| 173 |
return lang
|
| 174 |
+
return None
|
|
|
|
| 175 |
|
| 176 |
@classmethod
def is_rtl(cls, language: str) -> bool:
    """Return True when *language* is one of the codes in ``cls.RTL_LANGUAGES``."""
    rtl_codes = cls.RTL_LANGUAGES
    return language in rtl_codes
|
| 179 |
|
| 180 |
@classmethod
def get_font_for_language(cls, language: str, style_name: str = None) -> str:
    """
    Choose the font file name to use for a language / caption style pair.

    Resolution order:
      1. Languages in the non-Latin set below always use their entry in
         ``cls.LANGUAGE_FONT_MAP`` (or its "default" entry), ignoring the
         style.
      2. Otherwise a *style_name* present in ``cls.STYLE_FONT_MAP`` decides.
      3. Otherwise the language's own entry in ``cls.LANGUAGE_FONT_MAP``.
      4. Otherwise the "default" entry of ``cls.LANGUAGE_FONT_MAP``.

    Args:
        language: Language code to look up.
        style_name: Optional caption style name.

    Returns:
        The font file name.
    """
    script_bound_languages = {
        "ar", "fa", "ur", "he",
        "zh", "zh-tw", "ja", "ko",
        "hi", "mr", "ne", "th",
    }

    # The style font is never applied to these languages.
    if language in script_bound_languages:
        return cls.LANGUAGE_FONT_MAP.get(language, cls.LANGUAGE_FONT_MAP["default"])

    style_is_known = bool(style_name) and style_name in cls.STYLE_FONT_MAP
    if style_is_known:
        return cls.STYLE_FONT_MAP[style_name]

    language_key = language if language in cls.LANGUAGE_FONT_MAP else "default"
    return cls.LANGUAGE_FONT_MAP[language_key]
|
| 194 |
|
| 195 |
# ─────────────────────────────────────────────────────────────────────────
|
|
|
|
| 199 |
def get_urls(css_content: str, prefer_latin: bool = True) -> list:
    """
    Extract font file URLs from a Google Fonts CSS response.

    Two strategies are tried in order:

    1. Subset-annotated CSS (``/* latin */`` style comments before each
       ``@font-face``): one URL is chosen by subset name. With
       *prefer_latin* the "latin" subset wins, then "latin-ext", then the
       last subset seen; without it the first subset seen is used.
    2. CSS without subset comments: every ``url(...)`` is collected and the
       list is narrowed to ``.ttf`` files, else ``.woff`` files, else left
       as is. Extensions are compared case-insensitively on the URL path
       only, so a query string or fragment does not hide a TTF.

    Args:
        css_content: Raw CSS text.
        prefer_latin: Whether to favour the Latin subset in strategy 1.

    Returns:
        A list of URL strings; empty when the CSS contains no ``url(...)``.
    """
    # Extract all (subset_comment, url) pairs
    subset_pattern = re.compile(
        r'/\*\s*\[?\d*\]?\s*([\w\-]+)\s*\*/[^}]*?url\(([^)]+)\)',
        re.DOTALL,
    )
    pairs = subset_pattern.findall(css_content)

    if pairs:
        # A later entry for the same subset overwrites an earlier one.
        subset_map = {s.lower(): u.strip().strip("'\"") for s, u in pairs}
        if prefer_latin:
            for key in ("latin", "latin-ext"):
                if key in subset_map:
                    return [subset_map[key]]
            return [list(subset_map.values())[-1]]
        return [next(iter(subset_map.values()))]

    # Fallback: grab all raw URLs
    all_urls = [
        u.strip().strip("'\"")
        for u in re.findall(r'url\(([^)]+)\)', css_content)
    ]

    def _url_path(url: str) -> str:
        """Return the lower-cased URL without its query string or fragment."""
        return url.split("#", 1)[0].split("?", 1)[0].lower()

    # Prefer TTF, then woff; ".woff2" never ends with ".woff", so it is excluded.
    ttf = [u for u in all_urls if _url_path(u).endswith(".ttf")]
    woff = [u for u in all_urls if _url_path(u).endswith(".woff")]
    return ttf or woff or all_urls
|
| 229 |
|
| 230 |
+
# ─────────────────────────────────────────────────────────────────────────
|
| 231 |
+
# Font CSS download ← FIXED: uses TTF-forcing User-Agent
|
| 232 |
+
# ─────────────────────────────────────────────────────────────────────────
|
| 233 |
@staticmethod
|
| 234 |
def download_font_from_css(css_url: str, output_path: str) -> bool:
|
| 235 |
"""
|
| 236 |
+
Downloads the correct font file for a given Google Fonts CSS URL.
|
| 237 |
+
|
| 238 |
+
KEY FIX: Uses an old IE 6 User-Agent to force Google Fonts to return
|
| 239 |
+
TTF URLs instead of woff2. Pillow/FreeType cannot open woff2 files.
|
| 240 |
+
|
| 241 |
+
Modern Chrome UA → Google returns .woff2 → Pillow FAILS ❌
|
| 242 |
+
Old IE 6 UA → Google returns .ttf → Pillow works ✅
|
| 243 |
+
|
| 244 |
+
Two-pass strategy:
|
| 245 |
+
Pass 1: Old IE UA → gets TTF (ideal for Pillow)
|
| 246 |
+
Pass 2: Modern UA → gets woff2 as last resort (may fail in Pillow)
|
| 247 |
"""
|
| 248 |
+
# ── User-Agent constants ──────────────────────────────────────────────
|
| 249 |
+
# IE 6 on Windows XP — forces Google Fonts to return legacy TTF format
|
| 250 |
+
UA_TTF = (
|
| 251 |
+
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; "
|
| 252 |
+
"SV1; .NET CLR 1.1.4322)"
|
| 253 |
+
)
|
| 254 |
+
# Modern Chrome — returns woff2 (not ideal for Pillow, last resort)
|
| 255 |
+
UA_MODERN = (
|
| 256 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
| 257 |
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
| 258 |
+
"Chrome/124.0.0.0 Safari/537.36"
|
| 259 |
+
)
|
| 260 |
+
|
| 261 |
NON_LATIN_KEYWORDS = (
|
| 262 |
"arabic", "noto", "devanagari", "sc", "jp", "kr", "tc",
|
| 263 |
"thai", "sarabun", "heebo", "frank", "vazir", "tajawal",
|
|
|
|
| 267 |
is_non_latin = any(kw in filename for kw in NON_LATIN_KEYWORDS)
|
| 268 |
prefer_latin = not is_non_latin
|
| 269 |
|
| 270 |
+
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
|
| 271 |
+
|
| 272 |
+
for pass_num, ua in enumerate([UA_TTF, UA_MODERN], start=1):
|
| 273 |
+
ua_label = "TTF-forcing (IE6)" if pass_num == 1 else "Modern (woff2 fallback)"
|
| 274 |
+
try:
|
| 275 |
+
# ── Fetch CSS ─────────────────────────────────────────────────
|
| 276 |
+
resp = requests.get(
|
| 277 |
+
css_url,
|
| 278 |
+
headers={"User-Agent": ua},
|
| 279 |
+
timeout=15
|
| 280 |
+
)
|
| 281 |
+
resp.raise_for_status()
|
| 282 |
+
|
| 283 |
+
urls = Config.get_urls(resp.text, prefer_latin=prefer_latin)
|
| 284 |
+
if not urls:
|
| 285 |
+
print(f"⚠️ Pass {pass_num} ({ua_label}): no font URLs in CSS")
|
| 286 |
+
continue
|
| 287 |
+
|
| 288 |
+
font_url = urls[0]
|
| 289 |
+
ext = os.path.splitext(font_url.split("?")[0])[-1].lower()
|
| 290 |
+
print(f"⬇️ Pass {pass_num} ({ua_label}): {ext} → {font_url[:70]}…")
|
| 291 |
+
|
| 292 |
+
# ── Download font file ────────────────────────────────────────
|
| 293 |
+
font_resp = requests.get(
|
| 294 |
+
font_url,
|
| 295 |
+
headers={"User-Agent": UA_MODERN},
|
| 296 |
+
timeout=30
|
| 297 |
+
)
|
| 298 |
+
font_resp.raise_for_status()
|
| 299 |
+
data = font_resp.content
|
| 300 |
+
|
| 301 |
+
# ── Validate: check magic bytes ───────────────────────────────
|
| 302 |
+
if len(data) < 10_000:
|
| 303 |
+
print(f"⚠️ File too small ({len(data)} B) — likely error page, skipping")
|
| 304 |
+
continue
|
| 305 |
+
|
| 306 |
+
magic = data[:4]
|
| 307 |
+
is_ttf_magic = magic in (
|
| 308 |
+
b"\x00\x01\x00\x00", # TrueType
|
| 309 |
+
b"OTTO", # OpenType CFF
|
| 310 |
+
b"true", # TrueType variant
|
| 311 |
+
b"wOFF", # WOFF (Pillow ≥ 9.2 can open)
|
| 312 |
+
b"wOF2", # WOFF2 (Pillow may fail)
|
| 313 |
+
)
|
| 314 |
+
|
| 315 |
+
if not is_ttf_magic:
|
| 316 |
+
print(
|
| 317 |
+
f"⚠️ Pass {pass_num}: unexpected magic bytes {magic.hex()} "
|
| 318 |
+
f"(probably HTML error page) — skipping"
|
| 319 |
+
)
|
| 320 |
+
continue
|
| 321 |
+
|
| 322 |
+
if magic == b"wOF2":
|
| 323 |
+
print(
|
| 324 |
+
f"⚠️ Pass {pass_num}: received WOFF2 — "
|
| 325 |
+
f"Pillow may not be able to open this. "
|
| 326 |
+
f"Consider installing: sudo apt-get install fonts-noto-core"
|
| 327 |
+
)
|
| 328 |
+
|
| 329 |
+
with open(output_path, "wb") as f:
|
| 330 |
+
f.write(data)
|
| 331 |
+
|
| 332 |
+
print(f"✅ Font saved ({len(data):,} B, {ext}): {output_path}")
|
| 333 |
+
return True
|
| 334 |
+
|
| 335 |
+
except requests.RequestException as e:
|
| 336 |
+
print(f"❌ Pass {pass_num} network error: {e}")
|
| 337 |
+
except Exception as e:
|
| 338 |
+
print(f"❌ Pass {pass_num} unexpected error: {e}")
|
| 339 |
+
|
| 340 |
+
# ── Both passes failed ────────────────────────────────────────────────
|
| 341 |
+
print(
|
| 342 |
+
f"❌ All download attempts failed for {os.path.basename(output_path)}.\n"
|
| 343 |
+
f" Fix on Ubuntu/Debian:\n"
|
| 344 |
+
f" sudo apt-get install -y fonts-noto-core fonts-arabeyes\n"
|
| 345 |
+
f" Or copy a TTF manually to: {output_path}"
|
| 346 |
+
)
|
| 347 |
+
return False
|
core/free_translator.py
CHANGED
|
@@ -7,32 +7,24 @@ class FreeTranslator:
|
|
| 7 |
def __init__(self):
|
| 8 |
pass
|
| 9 |
|
| 10 |
-
def translate_text(self, text, target_language_code):
|
| 11 |
"""ترجمة مجانية باستخدام MyMemory API بدون httpx"""
|
| 12 |
if not text.strip():
|
| 13 |
return "", []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
"ar": "ar",
|
| 18 |
-
"en": "en",
|
| 19 |
-
"hi": "hi",
|
| 20 |
-
"zh": "zh",
|
| 21 |
-
"es": "es",
|
| 22 |
-
"fr": "fr",
|
| 23 |
-
"de": "de",
|
| 24 |
-
"ru": "ru",
|
| 25 |
-
"ja": "ja"
|
| 26 |
-
}
|
| 27 |
-
|
| 28 |
-
target_lang = lang_map.get(target_language_code, target_language_code)
|
| 29 |
|
| 30 |
try:
|
| 31 |
# استخدام urllib بدلاً من requests لتجنب مشكلة httpx
|
| 32 |
url = "https://api.mymemory.translated.net/get"
|
| 33 |
params = {
|
| 34 |
'q': text,
|
| 35 |
-
'langpair': f'
|
| 36 |
}
|
| 37 |
|
| 38 |
# بناء URL مع parameters
|
|
|
|
| 7 |
def __init__(self):
|
| 8 |
pass
|
| 9 |
|
| 10 |
+
def translate_text(self, text, target_language_code, source_language_code="en"):
|
| 11 |
"""ترجمة مجانية باستخدام MyMemory API بدون httpx"""
|
| 12 |
if not text.strip():
|
| 13 |
return "", []
|
| 14 |
+
|
| 15 |
+
# Handle same language case
|
| 16 |
+
if source_language_code.lower() == target_language_code.lower():
|
| 17 |
+
return text, []
|
| 18 |
|
| 19 |
+
target_lang = target_language_code.lower()
|
| 20 |
+
source_lang = source_language_code.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
try:
|
| 23 |
# استخدام urllib بدلاً من requests لتجنب مشكلة httpx
|
| 24 |
url = "https://api.mymemory.translated.net/get"
|
| 25 |
params = {
|
| 26 |
'q': text,
|
| 27 |
+
'langpair': f'{source_lang}|{target_lang}'
|
| 28 |
}
|
| 29 |
|
| 30 |
# بناء URL مع parameters
|
core/stt.py
CHANGED
|
@@ -33,7 +33,7 @@ SUBTITLE_STANDARDS = {
|
|
| 33 |
}
|
| 34 |
|
| 35 |
# Sentence-ending punctuation for smart splitting
|
| 36 |
-
SENTENCE_ENDINGS
|
| 37 |
CLAUSE_BOUNDARIES = re.compile(r'[,،;:،]+$')
|
| 38 |
|
| 39 |
|
|
@@ -49,7 +49,6 @@ class SubtitleSegmenter:
|
|
| 49 |
|
| 50 |
@staticmethod
|
| 51 |
def count_chars(text: str) -> int:
|
| 52 |
-
"""Count displayable characters (strip extra spaces)."""
|
| 53 |
return len(text.strip())
|
| 54 |
|
| 55 |
@staticmethod
|
|
@@ -62,7 +61,6 @@ class SubtitleSegmenter:
|
|
| 62 |
|
| 63 |
@staticmethod
|
| 64 |
def calc_min_duration(text: str) -> float:
|
| 65 |
-
"""Minimum display duration based on reading speed (EBU R37)."""
|
| 66 |
chars = SubtitleSegmenter.count_chars(text)
|
| 67 |
cps = SUBTITLE_STANDARDS["reading_speed_cps"]
|
| 68 |
return max(chars / cps, SUBTITLE_STANDARDS["min_duration_sec"])
|
|
@@ -73,13 +71,6 @@ class SubtitleSegmenter:
|
|
| 73 |
Splits a flat list of word dicts into subtitle blocks following
|
| 74 |
international standards. Each block has:
|
| 75 |
{ text, start, end, words, line1, line2 }
|
| 76 |
-
|
| 77 |
-
Priority for line breaks:
|
| 78 |
-
1. Sentence endings (.!?)
|
| 79 |
-
2. Clause boundaries (,;:)
|
| 80 |
-
3. Max chars per line (42)
|
| 81 |
-
4. Max words per block
|
| 82 |
-
5. Pause gaps in audio (> 0.5s)
|
| 83 |
"""
|
| 84 |
if not words:
|
| 85 |
return []
|
|
@@ -89,18 +80,15 @@ class SubtitleSegmenter:
|
|
| 89 |
MAX_WORDS = SUBTITLE_STANDARDS["max_words_per_block"]
|
| 90 |
PAUSE_GAP = SUBTITLE_STANDARDS["sentence_pause_gap"]
|
| 91 |
|
| 92 |
-
blocks
|
| 93 |
current_words = []
|
| 94 |
current_chars = 0
|
| 95 |
|
| 96 |
def flush_block(word_list):
|
| 97 |
-
"""Convert accumulated words into a subtitle block with line splitting."""
|
| 98 |
if not word_list:
|
| 99 |
return None
|
| 100 |
-
|
| 101 |
full_text = " ".join(w["text"] for w in word_list)
|
| 102 |
lines = SubtitleSegmenter._split_into_lines(full_text, MAX_CHARS)
|
| 103 |
-
|
| 104 |
return {
|
| 105 |
"text": full_text,
|
| 106 |
"start": word_list[0]["start"],
|
|
@@ -118,31 +106,22 @@ class SubtitleSegmenter:
|
|
| 118 |
word_chars = len(word_text)
|
| 119 |
is_last = (i == len(words) - 1)
|
| 120 |
|
| 121 |
-
# Detect natural pause between this word and the next
|
| 122 |
next_pause = 0.0
|
| 123 |
if not is_last:
|
| 124 |
next_pause = words[i + 1]["start"] - word["end"]
|
| 125 |
|
| 126 |
-
|
| 127 |
-
new_total = current_chars + (1 if current_words else 0) + word_chars
|
| 128 |
word_count = len(current_words) + 1
|
| 129 |
|
| 130 |
-
# ── Flush conditions (in priority order) ──────────────────────────
|
| 131 |
should_flush = (
|
| 132 |
-
# 1. Adding word would exceed max block chars
|
| 133 |
(current_words and new_total > MAX_BLOCK) or
|
| 134 |
-
# 2. Too many words
|
| 135 |
(current_words and word_count > MAX_WORDS) or
|
| 136 |
-
# 3. Long natural pause after current word (sentence boundary)
|
| 137 |
(current_words and next_pause >= PAUSE_GAP and
|
| 138 |
SubtitleSegmenter.is_sentence_end(word_text)) or
|
| 139 |
-
# 4. Very long pause (>1s) — definitely a new sentence
|
| 140 |
(current_words and next_pause > 1.0)
|
| 141 |
)
|
| 142 |
|
| 143 |
if should_flush and current_words:
|
| 144 |
-
# Check if we should include this word before flushing
|
| 145 |
-
# (if it's a sentence ending, include it in the current block)
|
| 146 |
if SubtitleSegmenter.is_sentence_end(word_text) and new_total <= MAX_BLOCK:
|
| 147 |
current_words.append(word)
|
| 148 |
current_chars = new_total
|
|
@@ -153,12 +132,9 @@ class SubtitleSegmenter:
|
|
| 153 |
current_words = []
|
| 154 |
current_chars = 0
|
| 155 |
|
| 156 |
-
# If we already added the word above, skip re-adding
|
| 157 |
if SubtitleSegmenter.is_sentence_end(word_text) and word in current_words:
|
| 158 |
continue
|
| 159 |
|
| 160 |
-
# ── Prefer breaking at clause boundaries when close to line limit ─
|
| 161 |
-
# If we're on the second line and hit a comma, flush
|
| 162 |
if (current_words and
|
| 163 |
current_chars > MAX_CHARS and
|
| 164 |
SubtitleSegmenter.is_clause_boundary(word_text)):
|
|
@@ -173,32 +149,26 @@ class SubtitleSegmenter:
|
|
| 173 |
current_words.append(word)
|
| 174 |
current_chars += (1 if len(current_words) > 1 else 0) + word_chars
|
| 175 |
|
| 176 |
-
# Flush remaining words
|
| 177 |
if current_words:
|
| 178 |
block = flush_block(current_words)
|
| 179 |
if block:
|
| 180 |
blocks.append(block)
|
| 181 |
|
| 182 |
-
# ── Post-process: enforce duration standards ───────────────────────────
|
| 183 |
blocks = SubtitleSegmenter._enforce_duration_standards(blocks)
|
| 184 |
-
|
| 185 |
return blocks
|
| 186 |
|
| 187 |
@staticmethod
|
| 188 |
def _split_into_lines(text: str, max_chars: int) -> list:
|
| 189 |
"""
|
| 190 |
Splits text into max 2 lines at a natural word boundary near the midpoint.
|
| 191 |
-
Prefers splitting at punctuation, then at the most balanced midpoint.
|
| 192 |
-
Returns [line1] or [line1, line2].
|
| 193 |
"""
|
| 194 |
if len(text) <= max_chars:
|
| 195 |
return [text]
|
| 196 |
|
| 197 |
words = text.split()
|
| 198 |
if len(words) <= 1:
|
| 199 |
-
return [text]
|
| 200 |
|
| 201 |
-
# Try to find the best split point
|
| 202 |
best_split = len(words) // 2
|
| 203 |
best_balance = float('inf')
|
| 204 |
|
|
@@ -206,15 +176,12 @@ class SubtitleSegmenter:
|
|
| 206 |
line1 = " ".join(words[:split_idx])
|
| 207 |
line2 = " ".join(words[split_idx:])
|
| 208 |
|
| 209 |
-
# Hard reject: either line over max_chars
|
| 210 |
if len(line1) > max_chars or len(line2) > max_chars:
|
| 211 |
continue
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
sentence_bonus = 10 if SENTENCE_ENDINGS.search(words[split_idx - 1]) else 0
|
| 216 |
|
| 217 |
-
# Balance score (closer to equal = better)
|
| 218 |
balance = abs(len(line1) - len(line2)) - punctuation_bonus - sentence_bonus
|
| 219 |
|
| 220 |
if balance < best_balance:
|
|
@@ -224,7 +191,6 @@ class SubtitleSegmenter:
|
|
| 224 |
line1 = " ".join(words[:best_split])
|
| 225 |
line2 = " ".join(words[best_split:])
|
| 226 |
|
| 227 |
-
# Fallback: if line2 still too long, truncate gracefully
|
| 228 |
if len(line2) > max_chars:
|
| 229 |
line2 = line2[:max_chars - 1] + "…"
|
| 230 |
|
|
@@ -233,10 +199,7 @@ class SubtitleSegmenter:
|
|
| 233 |
@staticmethod
|
| 234 |
def _enforce_duration_standards(blocks: list) -> list:
|
| 235 |
"""
|
| 236 |
-
Post-processes blocks to
|
| 237 |
-
- Enforce minimum display duration
|
| 238 |
-
- Enforce maximum display duration (split if needed)
|
| 239 |
-
- Ensure minimum gap between consecutive blocks (40ms)
|
| 240 |
"""
|
| 241 |
if not blocks:
|
| 242 |
return blocks
|
|
@@ -246,26 +209,20 @@ class SubtitleSegmenter:
|
|
| 246 |
MIN_GAP = SUBTITLE_STANDARDS["min_gap_between"]
|
| 247 |
|
| 248 |
processed = []
|
| 249 |
-
for
|
| 250 |
duration = block["end"] - block["start"]
|
| 251 |
|
| 252 |
-
# Extend duration if too short
|
| 253 |
if duration < MIN_DUR:
|
| 254 |
block = {**block, "end": block["start"] + MIN_DUR}
|
| 255 |
-
|
| 256 |
-
# Trim if too long (shouldn't happen with word-level splitting)
|
| 257 |
if duration > MAX_DUR:
|
| 258 |
block = {**block, "end": block["start"] + MAX_DUR}
|
| 259 |
|
| 260 |
processed.append(block)
|
| 261 |
|
| 262 |
-
# Enforce gap between consecutive subtitles
|
| 263 |
for i in range(1, len(processed)):
|
| 264 |
-
prev_end
|
| 265 |
curr_start = processed[i]["start"]
|
| 266 |
-
|
| 267 |
if curr_start - prev_end < MIN_GAP:
|
| 268 |
-
# Move current block start forward slightly
|
| 269 |
processed[i] = {**processed[i], "start": prev_end + MIN_GAP}
|
| 270 |
|
| 271 |
return processed
|
|
@@ -274,13 +231,13 @@ class SubtitleSegmenter:
|
|
| 274 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 275 |
|
| 276 |
class STT:
|
| 277 |
-
def __init__(self, model_size="
|
| 278 |
"""
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
"""
|
| 285 |
self.duration = 0
|
| 286 |
self.model_size = model_size
|
|
@@ -301,13 +258,13 @@ class STT:
|
|
| 301 |
"""
|
| 302 |
Transcribes video and returns subtitle-standard-compliant segments.
|
| 303 |
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
"""
|
| 312 |
print(f"🎙️ Transcribing: {video_path} (Language: {language or 'Auto'}, "
|
| 313 |
f"Mode: {timestamp_mode}, VAD: {vad_filter})")
|
|
@@ -315,7 +272,6 @@ class STT:
|
|
| 315 |
log_file = os.path.join(os.path.dirname(os.path.dirname(__file__)),
|
| 316 |
"logs", "transcript.log")
|
| 317 |
|
| 318 |
-
# ── Language normalisation ────────────────────────────────────────────
|
| 319 |
actual_stt_lang = None
|
| 320 |
if language:
|
| 321 |
lang_val = language.value if hasattr(language, 'value') else str(language)
|
|
@@ -347,25 +303,24 @@ class STT:
|
|
| 347 |
print(f"⚠️ Cache setup error: {e}")
|
| 348 |
|
| 349 |
# ── Whisper transcription ────────────────────────────────────────────
|
| 350 |
-
# Always request word_timestamps — needed for standards-compliant splitting
|
| 351 |
print(f"🔍 Starting Whisper transcription (model={self.model_size}, "
|
| 352 |
f"word_timestamps=True)…")
|
| 353 |
|
| 354 |
segments_iter, info = self.model.transcribe(
|
| 355 |
video_path,
|
| 356 |
-
beam_size=5,
|
| 357 |
-
word_timestamps=True,
|
| 358 |
language=actual_stt_lang,
|
| 359 |
vad_filter=vad_filter,
|
| 360 |
vad_parameters=dict(min_silence_duration_ms=500) if vad_filter else None,
|
| 361 |
-
condition_on_previous_text=True,
|
| 362 |
)
|
| 363 |
detected_lang = info.language
|
| 364 |
print(f"🔍 Detected language: {detected_lang}")
|
| 365 |
|
| 366 |
# ── Collect all words with timing ────────────────────────────────────
|
| 367 |
-
all_words
|
| 368 |
-
raw_segments = list(segments_iter)
|
| 369 |
|
| 370 |
for seg in raw_segments:
|
| 371 |
if seg.words:
|
|
@@ -375,18 +330,17 @@ class STT:
|
|
| 375 |
all_words.append({
|
| 376 |
"text": text,
|
| 377 |
"start": round(w.start, 3),
|
| 378 |
-
"end": round(w.end,
|
| 379 |
"is_highlight": False,
|
| 380 |
})
|
| 381 |
else:
|
| 382 |
-
# Fallback: segment-level only (no word timestamps available)
|
| 383 |
seg_words = seg.text.strip().split()
|
| 384 |
if seg_words:
|
| 385 |
avg = (seg.end - seg.start) / len(seg_words)
|
| 386 |
for i, wt in enumerate(seg_words):
|
| 387 |
all_words.append({
|
| 388 |
"text": wt,
|
| 389 |
-
"start": round(seg.start + i * avg,
|
| 390 |
"end": round(seg.start + (i + 1) * avg, 3),
|
| 391 |
"is_highlight": False,
|
| 392 |
})
|
|
@@ -401,7 +355,7 @@ class STT:
|
|
| 401 |
print(f"✅ Generated {len(subtitle_blocks)} subtitle blocks "
|
| 402 |
f"(was {len(raw_segments)} raw segments)")
|
| 403 |
|
| 404 |
-
# ── Build segments_list
|
| 405 |
segments_list = []
|
| 406 |
full_text = ""
|
| 407 |
|
|
@@ -411,7 +365,6 @@ class STT:
|
|
| 411 |
"start": block["start"],
|
| 412 |
"end": block["end"],
|
| 413 |
"words": block["words"],
|
| 414 |
-
# Extra: pre-computed line split for renderers
|
| 415 |
"_line1": block.get("line1", block["text"]),
|
| 416 |
"_line2": block.get("line2", ""),
|
| 417 |
})
|
|
@@ -429,7 +382,7 @@ class STT:
|
|
| 429 |
f.write(f"📐 Standards: BBC/Netflix/EBU R37 "
|
| 430 |
f"(max {SUBTITLE_STANDARDS['max_chars_per_line']} chars/line)\n")
|
| 431 |
f.write(f"{'='*60}\n")
|
| 432 |
-
for
|
| 433 |
chars = len(seg['_line1']) + len(seg.get('_line2', ''))
|
| 434 |
f.write(f"[{seg['start']:.2f}–{seg['end']:.2f}] "
|
| 435 |
f"({chars:2d}ch) {seg['text']}\n")
|
|
|
|
| 33 |
}
|
| 34 |
|
| 35 |
# Sentence-ending punctuation for smart splitting
|
| 36 |
+
SENTENCE_ENDINGS = re.compile(r'[.!?؟。!?]+$')
|
| 37 |
CLAUSE_BOUNDARIES = re.compile(r'[,،;:،]+$')
|
| 38 |
|
| 39 |
|
|
|
|
| 49 |
|
| 50 |
@staticmethod
def count_chars(text: str) -> int:
    """Return the length of *text* after stripping leading and trailing whitespace."""
    trimmed = text.strip()
    return len(trimmed)
|
| 53 |
|
| 54 |
@staticmethod
|
|
|
|
| 61 |
|
| 62 |
@staticmethod
def calc_min_duration(text: str) -> float:
    """
    Return the shortest time, in seconds, that *text* should stay on screen.

    The value is the stripped character count divided by
    ``SUBTITLE_STANDARDS["reading_speed_cps"]``, but never less than
    ``SUBTITLE_STANDARDS["min_duration_sec"]``.
    """
    char_count = SubtitleSegmenter.count_chars(text)
    reading_time = char_count / SUBTITLE_STANDARDS["reading_speed_cps"]
    return max(reading_time, SUBTITLE_STANDARDS["min_duration_sec"])
|
|
|
|
| 71 |
Splits a flat list of word dicts into subtitle blocks following
|
| 72 |
international standards. Each block has:
|
| 73 |
{ text, start, end, words, line1, line2 }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
"""
|
| 75 |
if not words:
|
| 76 |
return []
|
|
|
|
| 80 |
MAX_WORDS = SUBTITLE_STANDARDS["max_words_per_block"]
|
| 81 |
PAUSE_GAP = SUBTITLE_STANDARDS["sentence_pause_gap"]
|
| 82 |
|
| 83 |
+
blocks = []
|
| 84 |
current_words = []
|
| 85 |
current_chars = 0
|
| 86 |
|
| 87 |
def flush_block(word_list):
|
|
|
|
| 88 |
if not word_list:
|
| 89 |
return None
|
|
|
|
| 90 |
full_text = " ".join(w["text"] for w in word_list)
|
| 91 |
lines = SubtitleSegmenter._split_into_lines(full_text, MAX_CHARS)
|
|
|
|
| 92 |
return {
|
| 93 |
"text": full_text,
|
| 94 |
"start": word_list[0]["start"],
|
|
|
|
| 106 |
word_chars = len(word_text)
|
| 107 |
is_last = (i == len(words) - 1)
|
| 108 |
|
|
|
|
| 109 |
next_pause = 0.0
|
| 110 |
if not is_last:
|
| 111 |
next_pause = words[i + 1]["start"] - word["end"]
|
| 112 |
|
| 113 |
+
new_total = current_chars + (1 if current_words else 0) + word_chars
|
|
|
|
| 114 |
word_count = len(current_words) + 1
|
| 115 |
|
|
|
|
| 116 |
should_flush = (
|
|
|
|
| 117 |
(current_words and new_total > MAX_BLOCK) or
|
|
|
|
| 118 |
(current_words and word_count > MAX_WORDS) or
|
|
|
|
| 119 |
(current_words and next_pause >= PAUSE_GAP and
|
| 120 |
SubtitleSegmenter.is_sentence_end(word_text)) or
|
|
|
|
| 121 |
(current_words and next_pause > 1.0)
|
| 122 |
)
|
| 123 |
|
| 124 |
if should_flush and current_words:
|
|
|
|
|
|
|
| 125 |
if SubtitleSegmenter.is_sentence_end(word_text) and new_total <= MAX_BLOCK:
|
| 126 |
current_words.append(word)
|
| 127 |
current_chars = new_total
|
|
|
|
| 132 |
current_words = []
|
| 133 |
current_chars = 0
|
| 134 |
|
|
|
|
| 135 |
if SubtitleSegmenter.is_sentence_end(word_text) and word in current_words:
|
| 136 |
continue
|
| 137 |
|
|
|
|
|
|
|
| 138 |
if (current_words and
|
| 139 |
current_chars > MAX_CHARS and
|
| 140 |
SubtitleSegmenter.is_clause_boundary(word_text)):
|
|
|
|
| 149 |
current_words.append(word)
|
| 150 |
current_chars += (1 if len(current_words) > 1 else 0) + word_chars
|
| 151 |
|
|
|
|
| 152 |
if current_words:
|
| 153 |
block = flush_block(current_words)
|
| 154 |
if block:
|
| 155 |
blocks.append(block)
|
| 156 |
|
|
|
|
| 157 |
blocks = SubtitleSegmenter._enforce_duration_standards(blocks)
|
|
|
|
| 158 |
return blocks
|
| 159 |
|
| 160 |
@staticmethod
|
| 161 |
def _split_into_lines(text: str, max_chars: int) -> list:
|
| 162 |
"""
|
| 163 |
Splits text into max 2 lines at a natural word boundary near the midpoint.
|
|
|
|
|
|
|
| 164 |
"""
|
| 165 |
if len(text) <= max_chars:
|
| 166 |
return [text]
|
| 167 |
|
| 168 |
words = text.split()
|
| 169 |
if len(words) <= 1:
|
| 170 |
+
return [text]
|
| 171 |
|
|
|
|
| 172 |
best_split = len(words) // 2
|
| 173 |
best_balance = float('inf')
|
| 174 |
|
|
|
|
| 176 |
line1 = " ".join(words[:split_idx])
|
| 177 |
line2 = " ".join(words[split_idx:])
|
| 178 |
|
|
|
|
| 179 |
if len(line1) > max_chars or len(line2) > max_chars:
|
| 180 |
continue
|
| 181 |
|
| 182 |
+
punctuation_bonus = 5 if CLAUSE_BOUNDARIES.search(words[split_idx - 1]) else 0
|
| 183 |
+
sentence_bonus = 10 if SENTENCE_ENDINGS.search(words[split_idx - 1]) else 0
|
|
|
|
| 184 |
|
|
|
|
| 185 |
balance = abs(len(line1) - len(line2)) - punctuation_bonus - sentence_bonus
|
| 186 |
|
| 187 |
if balance < best_balance:
|
|
|
|
| 191 |
line1 = " ".join(words[:best_split])
|
| 192 |
line2 = " ".join(words[best_split:])
|
| 193 |
|
|
|
|
| 194 |
if len(line2) > max_chars:
|
| 195 |
line2 = line2[:max_chars - 1] + "…"
|
| 196 |
|
|
|
|
| 199 |
@staticmethod
|
| 200 |
def _enforce_duration_standards(blocks: list) -> list:
|
| 201 |
"""
|
| 202 |
+
Post-processes blocks to enforce min/max duration and minimum gap.
|
|
|
|
|
|
|
|
|
|
| 203 |
"""
|
| 204 |
if not blocks:
|
| 205 |
return blocks
|
|
|
|
| 209 |
MIN_GAP = SUBTITLE_STANDARDS["min_gap_between"]
|
| 210 |
|
| 211 |
processed = []
|
| 212 |
+
for block in blocks:
|
| 213 |
duration = block["end"] - block["start"]
|
| 214 |
|
|
|
|
| 215 |
if duration < MIN_DUR:
|
| 216 |
block = {**block, "end": block["start"] + MIN_DUR}
|
|
|
|
|
|
|
| 217 |
if duration > MAX_DUR:
|
| 218 |
block = {**block, "end": block["start"] + MAX_DUR}
|
| 219 |
|
| 220 |
processed.append(block)
|
| 221 |
|
|
|
|
| 222 |
for i in range(1, len(processed)):
|
| 223 |
+
prev_end = processed[i - 1]["end"]
|
| 224 |
curr_start = processed[i]["start"]
|
|
|
|
| 225 |
if curr_start - prev_end < MIN_GAP:
|
|
|
|
| 226 |
processed[i] = {**processed[i], "start": prev_end + MIN_GAP}
|
| 227 |
|
| 228 |
return processed
|
|
|
|
| 231 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 232 |
|
| 233 |
class STT:
|
| 234 |
+
def __init__(self, model_size="base"):
|
| 235 |
"""
|
| 236 |
+
Defaults to "base"; pass model_size="large-v3" for:
|
| 237 |
+
- Significantly better word-level timestamps (critical for highlight_word)
|
| 238 |
+
- Better sentence segmentation boundaries
|
| 239 |
+
- Improved Arabic/multilingual accuracy
|
| 240 |
+
Note: base model timing is ±200ms off; large-v3 is ±50ms.
|
| 241 |
"""
|
| 242 |
self.duration = 0
|
| 243 |
self.model_size = model_size
|
|
|
|
| 258 |
"""
|
| 259 |
Transcribes video and returns subtitle-standard-compliant segments.
|
| 260 |
|
| 261 |
+
✅ All segments post-processed through SubtitleSegmenter:
|
| 262 |
+
- Max 42 chars per line (BBC/Netflix)
|
| 263 |
+
- Max 2 lines per block
|
| 264 |
+
- Natural sentence/clause boundary splitting
|
| 265 |
+
- EBU R37 reading speed enforcement
|
| 266 |
+
- 40ms minimum gap between subtitles
|
| 267 |
+
- _line1 / _line2 pre-computed for renderers
|
| 268 |
"""
|
| 269 |
print(f"🎙️ Transcribing: {video_path} (Language: {language or 'Auto'}, "
|
| 270 |
f"Mode: {timestamp_mode}, VAD: {vad_filter})")
|
|
|
|
| 272 |
log_file = os.path.join(os.path.dirname(os.path.dirname(__file__)),
|
| 273 |
"logs", "transcript.log")
|
| 274 |
|
|
|
|
| 275 |
actual_stt_lang = None
|
| 276 |
if language:
|
| 277 |
lang_val = language.value if hasattr(language, 'value') else str(language)
|
|
|
|
| 303 |
print(f"⚠️ Cache setup error: {e}")
|
| 304 |
|
| 305 |
# ── Whisper transcription ────────────────────────────────────────────
|
|
|
|
| 306 |
print(f"🔍 Starting Whisper transcription (model={self.model_size}, "
|
| 307 |
f"word_timestamps=True)…")
|
| 308 |
|
| 309 |
segments_iter, info = self.model.transcribe(
|
| 310 |
video_path,
|
| 311 |
+
beam_size=5,
|
| 312 |
+
word_timestamps=True, # Always needed for standards & highlight_word
|
| 313 |
language=actual_stt_lang,
|
| 314 |
vad_filter=vad_filter,
|
| 315 |
vad_parameters=dict(min_silence_duration_ms=500) if vad_filter else None,
|
| 316 |
+
condition_on_previous_text=True,
|
| 317 |
)
|
| 318 |
detected_lang = info.language
|
| 319 |
print(f"🔍 Detected language: {detected_lang}")
|
| 320 |
|
| 321 |
# ── Collect all words with timing ────────────────────────────────────
|
| 322 |
+
all_words = []
|
| 323 |
+
raw_segments = list(segments_iter)
|
| 324 |
|
| 325 |
for seg in raw_segments:
|
| 326 |
if seg.words:
|
|
|
|
| 330 |
all_words.append({
|
| 331 |
"text": text,
|
| 332 |
"start": round(w.start, 3),
|
| 333 |
+
"end": round(w.end, 3),
|
| 334 |
"is_highlight": False,
|
| 335 |
})
|
| 336 |
else:
|
|
|
|
| 337 |
seg_words = seg.text.strip().split()
|
| 338 |
if seg_words:
|
| 339 |
avg = (seg.end - seg.start) / len(seg_words)
|
| 340 |
for i, wt in enumerate(seg_words):
|
| 341 |
all_words.append({
|
| 342 |
"text": wt,
|
| 343 |
+
"start": round(seg.start + i * avg, 3),
|
| 344 |
"end": round(seg.start + (i + 1) * avg, 3),
|
| 345 |
"is_highlight": False,
|
| 346 |
})
|
|
|
|
| 355 |
print(f"✅ Generated {len(subtitle_blocks)} subtitle blocks "
|
| 356 |
f"(was {len(raw_segments)} raw segments)")
|
| 357 |
|
| 358 |
+
# ── Build segments_list ───────────────────────────────────────────────
|
| 359 |
segments_list = []
|
| 360 |
full_text = ""
|
| 361 |
|
|
|
|
| 365 |
"start": block["start"],
|
| 366 |
"end": block["end"],
|
| 367 |
"words": block["words"],
|
|
|
|
| 368 |
"_line1": block.get("line1", block["text"]),
|
| 369 |
"_line2": block.get("line2", ""),
|
| 370 |
})
|
|
|
|
| 382 |
f.write(f"📐 Standards: BBC/Netflix/EBU R37 "
|
| 383 |
f"(max {SUBTITLE_STANDARDS['max_chars_per_line']} chars/line)\n")
|
| 384 |
f.write(f"{'='*60}\n")
|
| 385 |
+
for seg in segments_list:
|
| 386 |
chars = len(seg['_line1']) + len(seg.get('_line2', ''))
|
| 387 |
f.write(f"[{seg['start']:.2f}–{seg['end']:.2f}] "
|
| 388 |
f"({chars:2d}ch) {seg['text']}\n")
|
core/subtitle_manager.py
CHANGED
|
@@ -7,35 +7,67 @@ Styles tuned for 2024-2025 Shorts/Reels/TikTok viral aesthetics.
|
|
| 7 |
- active_word_index (int) replaces unreliable id() comparison
|
| 8 |
- RTL detection covers Arabic, Persian, Urdu, Hebrew (not just Arabic)
|
| 9 |
- Hebrew uses bidi-only (no Arabic reshaping)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
- CJK/Thai/Devanagari not uppercased
|
| 11 |
- ensure_font() uses Config.detect_language_from_text() + Config.get_font_for_language()
|
| 12 |
-
-
|
| 13 |
- BBC/Netflix standards: max 42 chars/line, 2 lines max
|
| 14 |
-
-
|
|
|
|
| 15 |
"""
|
| 16 |
import os
|
| 17 |
import numpy as np
|
| 18 |
import urllib.request
|
| 19 |
from PIL import Image, ImageDraw, ImageFont
|
| 20 |
import moviepy.editor as mpe
|
| 21 |
-
from arabic_reshaper import
|
| 22 |
from bidi.algorithm import get_display
|
| 23 |
from .config import Config
|
| 24 |
from .logger import Logger
|
| 25 |
|
| 26 |
logger = Logger.get_logger(__name__)
|
| 27 |
|
| 28 |
-
|
| 29 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 30 |
-
#
|
| 31 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 32 |
#
|
| 33 |
-
#
|
| 34 |
-
#
|
| 35 |
-
#
|
| 36 |
-
#
|
| 37 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
STYLES = {
|
| 40 |
|
| 41 |
# ── 1. CLASSIC ────────────────────────────────────────────────────────────
|
|
@@ -92,7 +124,6 @@ STYLES = {
|
|
| 92 |
},
|
| 93 |
|
| 94 |
# ── 4. TIKTOK NEON ────────────────────────────────────────────────────────
|
| 95 |
-
# ✅ Changed font from Oswald-Bold (Latin-only) to Montserrat-Bold (multilingual)
|
| 96 |
"tiktok_neon": {
|
| 97 |
"fontsize": 80,
|
| 98 |
"color": (255, 255, 255, 230),
|
|
@@ -145,6 +176,67 @@ STYLES = {
|
|
| 145 |
(0, 9, 0, ( 0, 0, 0, 130)),
|
| 146 |
],
|
| 147 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
}
|
| 149 |
|
| 150 |
|
|
@@ -170,7 +262,6 @@ _NO_UPPER_RANGES = [
|
|
| 170 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 171 |
|
| 172 |
def _rgba(c):
|
| 173 |
-
"""Normalise any colour spec to an (R, G, B, A) tuple."""
|
| 174 |
if c is None:
|
| 175 |
return None
|
| 176 |
if isinstance(c, (tuple, list)):
|
|
@@ -180,42 +271,62 @@ def _rgba(c):
|
|
| 180 |
|
| 181 |
|
| 182 |
def _should_uppercase(text: str) -> bool:
|
| 183 |
-
"""Returns False for scripts where uppercasing is not applicable."""
|
| 184 |
for start, end in _NO_UPPER_RANGES:
|
| 185 |
if any(start <= c <= end for c in text):
|
| 186 |
return False
|
| 187 |
return True
|
| 188 |
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
def _prepare_display_text(raw: str, is_rtl: bool, language: str = None) -> str:
|
| 191 |
"""
|
| 192 |
-
Prepares text for rendering
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
"""
|
| 198 |
if not is_rtl:
|
| 199 |
return raw.upper() if _should_uppercase(raw) else raw
|
| 200 |
|
| 201 |
-
#
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
-
#
|
| 208 |
try:
|
| 209 |
-
return get_display(
|
| 210 |
except Exception:
|
| 211 |
return raw
|
| 212 |
|
| 213 |
|
| 214 |
def _is_rtl_text(language: str, text: str) -> bool:
|
| 215 |
-
"""
|
| 216 |
-
Returns True if language or text content requires RTL rendering.
|
| 217 |
-
Covers: Arabic (ar), Persian (fa), Urdu (ur), Hebrew (he).
|
| 218 |
-
"""
|
| 219 |
if language and Config.is_rtl(language):
|
| 220 |
return True
|
| 221 |
if text:
|
|
@@ -226,14 +337,6 @@ def _is_rtl_text(language: str, text: str) -> bool:
|
|
| 226 |
|
| 227 |
|
| 228 |
def _draw_shadow_layers(draw, box, layers, base_radius):
|
| 229 |
-
"""
|
| 230 |
-
✅ FIXED: Was called with `pass` in original — now fully operational.
|
| 231 |
-
Paints shadow / glow layers behind a rounded-rect.
|
| 232 |
-
|
| 233 |
-
layers: [(off_x, off_y, blur_steps, rgba)]
|
| 234 |
-
blur_steps == 0 → single hard-offset rectangle
|
| 235 |
-
blur_steps > 0 → concentric rects with fading alpha (soft glow)
|
| 236 |
-
"""
|
| 237 |
x1, y1, x2, y2 = box
|
| 238 |
for (ox, oy, blur, color) in layers:
|
| 239 |
rgba = _rgba(color)
|
|
@@ -262,35 +365,18 @@ def _draw_shadow_layers(draw, box, layers, base_radius):
|
|
| 262 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 263 |
class SubtitleManager:
|
| 264 |
|
| 265 |
-
# ── Font management ───────────────────────────────────────────────────────
|
| 266 |
@staticmethod
|
| 267 |
def ensure_font(language: str = None, style_name: str = None,
|
| 268 |
style_font: str = None, text_content: str = None) -> str:
|
| 269 |
-
"""
|
| 270 |
-
Returns the absolute path to the best available font for the given
|
| 271 |
-
language and caption style.
|
| 272 |
-
|
| 273 |
-
Resolution order:
|
| 274 |
-
1. Explicit language → Config.get_font_for_language(language, style_name)
|
| 275 |
-
2. Auto-detect script from text → Config.detect_language_from_text()
|
| 276 |
-
3. Style font (Latin) → style_font param
|
| 277 |
-
4. Default → Config.LANGUAGE_FONT_MAP["default"]
|
| 278 |
-
|
| 279 |
-
Non-Latin scripts (Arabic, Hebrew, CJK, Thai, Devanagari, Cyrillic)
|
| 280 |
-
always override the style font preference.
|
| 281 |
-
"""
|
| 282 |
-
# ── 1. Resolve language code ──────────────────────────────────────────
|
| 283 |
detected_lang = None
|
| 284 |
|
| 285 |
if language:
|
| 286 |
lang_val = language.value if hasattr(language, 'value') else str(language)
|
| 287 |
detected_lang = None if lang_val == 'auto' else lang_val
|
| 288 |
|
| 289 |
-
# Auto-detect from text content if no explicit language given
|
| 290 |
if not detected_lang and text_content:
|
| 291 |
detected_lang = Config.detect_language_from_text(text_content)
|
| 292 |
|
| 293 |
-
# ── 2. Select font name ───────────────────────────────────────────────
|
| 294 |
if detected_lang:
|
| 295 |
font_name = Config.get_font_for_language(detected_lang, style_name)
|
| 296 |
elif style_font:
|
|
@@ -300,7 +386,6 @@ class SubtitleManager:
|
|
| 300 |
|
| 301 |
logger.debug(f"🔤 Font resolved: lang={detected_lang} style={style_name} → {font_name}")
|
| 302 |
|
| 303 |
-
# ── 3. Resolve path & download if missing ─────────────────────────────
|
| 304 |
font_path = os.path.join(Config.BASE_DIR, font_name)
|
| 305 |
|
| 306 |
if not os.path.exists(font_path):
|
|
@@ -318,9 +403,11 @@ class SubtitleManager:
|
|
| 318 |
except Exception as exc:
|
| 319 |
logger.error(f"❌ Font download failed for {font_name}: {exc}")
|
| 320 |
|
| 321 |
-
#
|
| 322 |
-
|
| 323 |
-
|
|
|
|
|
|
|
| 324 |
if not os.path.exists(fallback_path):
|
| 325 |
fallback_url = Config.FONTS.get(fallback_name)
|
| 326 |
if fallback_url:
|
|
@@ -330,10 +417,9 @@ class SubtitleManager:
|
|
| 330 |
pass
|
| 331 |
|
| 332 |
if os.path.exists(fallback_path):
|
| 333 |
-
logger.warning(f"⚠️ Using
|
| 334 |
return fallback_path
|
| 335 |
|
| 336 |
-
# Fallback 2: system Arial (Latin only)
|
| 337 |
logger.error("❌ All font downloads failed, falling back to system Arial")
|
| 338 |
return "Arial"
|
| 339 |
else:
|
|
@@ -341,10 +427,8 @@ class SubtitleManager:
|
|
| 341 |
|
| 342 |
return font_path
|
| 343 |
|
| 344 |
-
# ── Text wrapping (pixel-accurate) ───────────────────────────────────────
|
| 345 |
@staticmethod
|
| 346 |
def wrap_text(text: str, font, max_width: int) -> list:
|
| 347 |
-
"""Splits text into lines that fit within max_width pixels."""
|
| 348 |
lines = []
|
| 349 |
words = text.split()
|
| 350 |
if not words:
|
|
@@ -360,7 +444,6 @@ class SubtitleManager:
|
|
| 360 |
|
| 361 |
if width > max_width:
|
| 362 |
if len(current_line) == 1:
|
| 363 |
-
# Single word is already too wide — force it on its own line
|
| 364 |
lines.append(current_line.pop())
|
| 365 |
else:
|
| 366 |
last = current_line.pop()
|
|
@@ -372,16 +455,11 @@ class SubtitleManager:
|
|
| 372 |
|
| 373 |
return lines
|
| 374 |
|
| 375 |
-
# ── Single-text PIL clip (sentence / word modes) ──────────────────────────
|
| 376 |
@staticmethod
|
| 377 |
def create_pil_text_clip(text: str, fontsize: int, color, font_path: str,
|
| 378 |
stroke_color=(0, 0, 0, 200), stroke_width: int = 2,
|
| 379 |
bg_color=None, padding: int = 12, bg_radius: int = 18,
|
| 380 |
max_width: int = None):
|
| 381 |
-
"""
|
| 382 |
-
Renders a single subtitle text block (sentence or word mode).
|
| 383 |
-
Supports multi-line wrapping and optional background pill.
|
| 384 |
-
"""
|
| 385 |
try:
|
| 386 |
try:
|
| 387 |
font = ImageFont.truetype(font_path, fontsize)
|
|
@@ -392,13 +470,11 @@ class SubtitleManager:
|
|
| 392 |
dummy = Image.new("RGBA", (1, 1))
|
| 393 |
d = ImageDraw.Draw(dummy)
|
| 394 |
|
| 395 |
-
# Wrap if width limit given
|
| 396 |
lines = [text]
|
| 397 |
if max_width:
|
| 398 |
avail = max_width - padding * 4
|
| 399 |
lines = SubtitleManager.wrap_text(text, font, avail)
|
| 400 |
|
| 401 |
-
# Measure all lines
|
| 402 |
line_metrics = []
|
| 403 |
max_w = 0
|
| 404 |
total_h = 0
|
|
@@ -429,7 +505,6 @@ class SubtitleManager:
|
|
| 429 |
|
| 430 |
current_y = margin
|
| 431 |
for m in line_metrics:
|
| 432 |
-
# Centre each line horizontally
|
| 433 |
lx = (iw - m["w"]) / 2 - m["bbox"][0]
|
| 434 |
ly = current_y - m["bbox"][1]
|
| 435 |
draw.text(
|
|
@@ -446,54 +521,40 @@ class SubtitleManager:
|
|
| 446 |
logger.error(f"⚠️ create_pil_text_clip error: {exc}")
|
| 447 |
return None
|
| 448 |
|
| 449 |
-
# ── Highlight-word composite renderer ─────────────────────────────────────
|
| 450 |
@staticmethod
|
| 451 |
def create_sentence_highlight_clip(
|
| 452 |
sentence_words: list,
|
| 453 |
-
active_word_index: int,
|
| 454 |
font,
|
| 455 |
fontsize: int,
|
| 456 |
font_path: str,
|
| 457 |
style_config: dict,
|
| 458 |
-
is_rtl: bool,
|
| 459 |
-
language: str = None,
|
| 460 |
padding: int = 14,
|
| 461 |
bg_radius: int = 20,
|
| 462 |
max_width: int = None,
|
| 463 |
):
|
| 464 |
-
"""
|
| 465 |
-
Renders a sentence with one highlighted (active) word.
|
| 466 |
-
|
| 467 |
-
✅ Fixes vs original:
|
| 468 |
-
1. active_word_index (int) — reliable, replaces id()-based comparison
|
| 469 |
-
2. _draw_shadow_layers() actually called (was `pass` in original)
|
| 470 |
-
3. _prepare_display_text() handles Hebrew, Persian, CJK correctly
|
| 471 |
-
4. RTL word order reversed for Arabic/Hebrew/Persian/Urdu
|
| 472 |
-
5. Multi-line wrapping with pixel-accurate measurement
|
| 473 |
-
"""
|
| 474 |
try:
|
| 475 |
dummy = Image.new("RGBA", (1, 1))
|
| 476 |
d = ImageDraw.Draw(dummy)
|
| 477 |
space_w = d.textbbox((0, 0), " ", font=font)[2]
|
| 478 |
|
| 479 |
-
# ── 1. Prepare & measure words ────────────────────────────────────
|
| 480 |
words_data = []
|
| 481 |
-
|
| 482 |
-
ordered = list(reversed(sentence_words)) if is_rtl else sentence_words
|
| 483 |
|
| 484 |
for idx, w in enumerate(ordered):
|
| 485 |
raw = w.get("text", "")
|
| 486 |
display = _prepare_display_text(raw, is_rtl, language)
|
| 487 |
bbox = d.textbbox((0, 0), display, font=font)
|
| 488 |
words_data.append({
|
| 489 |
-
"index": idx,
|
| 490 |
"text": display,
|
| 491 |
"w": bbox[2] - bbox[0],
|
| 492 |
"h": bbox[3] - bbox[1],
|
| 493 |
"bbox": bbox,
|
| 494 |
})
|
| 495 |
|
| 496 |
-
# For RTL: the active word index must be mirrored
|
| 497 |
n = len(sentence_words)
|
| 498 |
effective_active_index = (
|
| 499 |
(n - 1 - active_word_index)
|
|
@@ -501,7 +562,6 @@ class SubtitleManager:
|
|
| 501 |
else active_word_index
|
| 502 |
)
|
| 503 |
|
| 504 |
-
# ── 2. Wrap words into lines ──────────────────────────────────────
|
| 505 |
lines = []
|
| 506 |
current_line = []
|
| 507 |
current_w = 0
|
|
@@ -522,7 +582,6 @@ class SubtitleManager:
|
|
| 522 |
if current_line:
|
| 523 |
lines.append(current_line)
|
| 524 |
|
| 525 |
-
# ── 3. Calculate canvas dimensions ───────────────────────────────
|
| 526 |
line_spacing = int(fontsize * 0.2)
|
| 527 |
stroke_w = style_config.get("stroke_width", 2)
|
| 528 |
margin = int(stroke_w * 2) + padding
|
|
@@ -546,13 +605,11 @@ class SubtitleManager:
|
|
| 546 |
img = Image.new("RGBA", (int(iw), int(ih)), (0, 0, 0, 0))
|
| 547 |
draw = ImageDraw.Draw(img)
|
| 548 |
|
| 549 |
-
# ── 4. Draw shadows & highlight box for active word ───────────────
|
| 550 |
hl_bg = style_config.get("highlight_bg")
|
| 551 |
hl_radius = style_config.get("highlight_bg_radius", bg_radius)
|
| 552 |
shadows = style_config.get("shadow_layers", [])
|
| 553 |
|
| 554 |
for i, line in enumerate(lines):
|
| 555 |
-
# Centre line horizontally
|
| 556 |
lx = margin + (canvas_w - line_infos[i]["w"]) // 2
|
| 557 |
ly = margin + bleed // 2 + line_infos[i]["y"]
|
| 558 |
cx = lx
|
|
@@ -565,11 +622,9 @@ class SubtitleManager:
|
|
| 565 |
by2 = ly + wd["h"] + padding // 2
|
| 566 |
box = (bx1, by1, bx2, by2)
|
| 567 |
|
| 568 |
-
# ✅ FIXED: shadow layers are now actually rendered
|
| 569 |
if shadows:
|
| 570 |
_draw_shadow_layers(draw, box, shadows, hl_radius)
|
| 571 |
|
| 572 |
-
# Highlight pill on top of shadows
|
| 573 |
draw.rounded_rectangle(
|
| 574 |
[(bx1, by1), (bx2, by2)],
|
| 575 |
radius=hl_radius,
|
|
@@ -578,7 +633,6 @@ class SubtitleManager:
|
|
| 578 |
|
| 579 |
cx += wd["w"] + space_w
|
| 580 |
|
| 581 |
-
# ── 5. Draw all word text ─────────────────────────────────────────
|
| 582 |
rest_c = _rgba(style_config.get("color", (255, 255, 255, 255)))
|
| 583 |
hl_c = _rgba(style_config.get("highlight_color", rest_c))
|
| 584 |
stk_c = _rgba(style_config.get("stroke_color", (0, 0, 0, 255)))
|
|
@@ -604,13 +658,10 @@ class SubtitleManager:
|
|
| 604 |
logger.error(f"⚠️ create_sentence_highlight_clip error: {exc}")
|
| 605 |
return None
|
| 606 |
|
| 607 |
-
# ── Public style accessor ──────────────────────────────────────────────────
|
| 608 |
@staticmethod
|
| 609 |
def get_style_config(style_name: str) -> dict:
|
| 610 |
-
"""Returns the style dict for the given name (falls back to 'classic')."""
|
| 611 |
return STYLES.get(style_name, STYLES["classic"])
|
| 612 |
|
| 613 |
-
# ── Main generator ─────────────────────────────────────────────────────────
|
| 614 |
@staticmethod
|
| 615 |
def create_caption_clips(
|
| 616 |
transcript_data,
|
|
@@ -622,15 +673,14 @@ class SubtitleManager:
|
|
| 622 |
"""
|
| 623 |
Generates all caption ImageClips ready for compositing.
|
| 624 |
|
| 625 |
-
|
| 626 |
-
"
|
| 627 |
-
"
|
| 628 |
-
"
|
| 629 |
"""
|
| 630 |
all_clips = []
|
| 631 |
style_cfg = SubtitleManager.get_style_config(caption_style)
|
| 632 |
|
| 633 |
-
# ── Parse transcript ──────────────────────────────────────────────────
|
| 634 |
segments = []
|
| 635 |
sample_text = ""
|
| 636 |
|
|
@@ -647,7 +697,6 @@ class SubtitleManager:
|
|
| 647 |
sample_text = s["text"]
|
| 648 |
break
|
| 649 |
|
| 650 |
-
# Resolve font — pass style_name for correct STYLE_FONT_MAP lookup
|
| 651 |
font_path = SubtitleManager.ensure_font(
|
| 652 |
language = language,
|
| 653 |
style_name = caption_style,
|
|
@@ -662,15 +711,6 @@ class SubtitleManager:
|
|
| 662 |
# MODE: highlight_word
|
| 663 |
# ══════════════════════════════════════════════════════════════════════
|
| 664 |
if caption_mode == "highlight_word":
|
| 665 |
-
all_words = []
|
| 666 |
-
for seg in segments:
|
| 667 |
-
if "words" in seg and seg["words"]:
|
| 668 |
-
all_words.extend(seg["words"])
|
| 669 |
-
|
| 670 |
-
if not all_words:
|
| 671 |
-
logger.warning("⚠️ highlight_word mode requires word-level timestamps — none found.")
|
| 672 |
-
return []
|
| 673 |
-
|
| 674 |
fontsize = style_cfg.get("fontsize", 75)
|
| 675 |
try:
|
| 676 |
font = ImageFont.truetype(font_path, fontsize)
|
|
@@ -678,36 +718,33 @@ class SubtitleManager:
|
|
| 678 |
logger.warning("⚠️ TrueType load failed — using default font.")
|
| 679 |
font = ImageFont.load_default()
|
| 680 |
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
|
|
|
|
|
|
| 685 |
continue
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
# ✅ RTL detection covers Arabic, Persian, Urdu, Hebrew
|
| 699 |
-
is_rtl = _is_rtl_text(language, sent_text)
|
| 700 |
-
|
| 701 |
-
# ── One clip per active word (highlight moves) ─────────────────
|
| 702 |
-
for active_idx, active in enumerate(sw):
|
| 703 |
clip = SubtitleManager.create_sentence_highlight_clip(
|
| 704 |
sentence_words = sw,
|
| 705 |
-
active_word_index = active_idx,
|
| 706 |
font = font,
|
| 707 |
fontsize = fontsize,
|
| 708 |
font_path = font_path,
|
| 709 |
style_config = style_cfg,
|
| 710 |
-
is_rtl = is_rtl,
|
| 711 |
language = language,
|
| 712 |
padding = style_cfg.get("padding", 14),
|
| 713 |
bg_radius = style_cfg.get("highlight_bg_radius", 20),
|
|
@@ -715,14 +752,14 @@ class SubtitleManager:
|
|
| 715 |
)
|
| 716 |
if clip:
|
| 717 |
all_clips.append(
|
| 718 |
-
clip.set_start(
|
| 719 |
-
.set_end(
|
| 720 |
.set_position(pos)
|
| 721 |
)
|
| 722 |
|
| 723 |
-
# ── Fill inter-word gaps with plain sentence ───────────────────
|
| 724 |
covered = [(w["start"], w["end"]) for w in sw]
|
| 725 |
gaps = []
|
|
|
|
| 726 |
if sent_start < covered[0][0]:
|
| 727 |
gaps.append((sent_start, covered[0][0]))
|
| 728 |
for j in range(len(covered) - 1):
|
|
@@ -731,11 +768,13 @@ class SubtitleManager:
|
|
| 731 |
if covered[-1][1] < sent_end:
|
| 732 |
gaps.append((covered[-1][1], sent_end))
|
| 733 |
|
|
|
|
| 734 |
for gs, ge in gaps:
|
| 735 |
-
|
|
|
|
| 736 |
gc = SubtitleManager.create_sentence_highlight_clip(
|
| 737 |
sentence_words = sw,
|
| 738 |
-
active_word_index = -1,
|
| 739 |
font = font,
|
| 740 |
fontsize = fontsize,
|
| 741 |
font_path = font_path,
|
|
@@ -768,16 +807,13 @@ class SubtitleManager:
|
|
| 768 |
else:
|
| 769 |
continue
|
| 770 |
|
| 771 |
-
# ✅ Use pre-computed line splits from STT (standards-compliant)
|
| 772 |
line1 = seg.get("_line1", "")
|
| 773 |
line2 = seg.get("_line2", "")
|
| 774 |
|
| 775 |
if line1:
|
| 776 |
-
# STT already applied BBC/Netflix standards — render as single block
|
| 777 |
display_text = f"{line1}\n{line2}".strip() if line2 else line1
|
| 778 |
chunks = [{"text": display_text, "start": start_t, "end": end_t}]
|
| 779 |
else:
|
| 780 |
-
# Fallback: original chunking behaviour
|
| 781 |
chunk_size = 1 if caption_mode == "word" else 4
|
| 782 |
chunks = []
|
| 783 |
stt_words = seg.get("words")
|
|
@@ -805,7 +841,7 @@ class SubtitleManager:
|
|
| 805 |
|
| 806 |
for chunk in chunks:
|
| 807 |
disp = chunk["text"]
|
| 808 |
-
is_rtl = _is_rtl_text(language, disp)
|
| 809 |
disp = _prepare_display_text(disp, is_rtl, language)
|
| 810 |
|
| 811 |
clip = SubtitleManager.create_pil_text_clip(
|
|
@@ -828,7 +864,6 @@ class SubtitleManager:
|
|
| 828 |
|
| 829 |
return all_clips
|
| 830 |
|
| 831 |
-
# ── Convenience compositor ─────────────────────────────────────────────────
|
| 832 |
@staticmethod
|
| 833 |
def create_captions(
|
| 834 |
video_clip,
|
|
@@ -838,7 +873,6 @@ class SubtitleManager:
|
|
| 838 |
caption_mode: str = "sentence",
|
| 839 |
caption_style: str = "classic",
|
| 840 |
):
|
| 841 |
-
"""Composites all caption clips onto video_clip and returns the result."""
|
| 842 |
clips = SubtitleManager.create_caption_clips(
|
| 843 |
transcript_data,
|
| 844 |
size = size,
|
|
|
|
| 7 |
- active_word_index (int) replaces unreliable id() comparison
|
| 8 |
- RTL detection covers Arabic, Persian, Urdu, Hebrew (not just Arabic)
|
| 9 |
- Hebrew uses bidi-only (no Arabic reshaping)
|
| 10 |
+
- Arabic / Persian / Urdu → ArabicReshaper (configured) + bidi
|
| 11 |
+
✅ arabic_reshaper RESTORED — Pillow does NOT do Arabic glyph shaping
|
| 12 |
+
internally. Without reshaper every Arabic letter renders in its isolated
|
| 13 |
+
form (disconnected). reshaper converts to presentation forms BEFORE
|
| 14 |
+
Pillow draws, which is the only correct approach for PIL.ImageDraw.
|
| 15 |
+
Config: support_ligatures=True, delete_harakat=False (preserves tashkeel),
|
| 16 |
+
delete_tatweel=True (removes kashida for accurate width measurement).
|
| 17 |
- CJK/Thai/Devanagari not uppercased
|
| 18 |
- ensure_font() uses Config.detect_language_from_text() + Config.get_font_for_language()
|
| 19 |
+
- Arabic-specific font fallback: NotoSansArabic before NotoSans
|
| 20 |
- BBC/Netflix standards: max 42 chars/line, 2 lines max
|
| 21 |
+
- highlight_word mode uses pre-segmented SubtitleSegmenter blocks directly
|
| 22 |
+
- 3 new Arabic-optimised styles: cairo_bold, tajawal_bold, noto_arabic
|
| 23 |
"""
|
| 24 |
import os
|
| 25 |
import numpy as np
|
| 26 |
import urllib.request
|
| 27 |
from PIL import Image, ImageDraw, ImageFont
|
| 28 |
import moviepy.editor as mpe
|
| 29 |
+
from arabic_reshaper import ArabicReshaper # ✅ REQUIRED for Pillow Arabic rendering
|
| 30 |
from bidi.algorithm import get_display
|
| 31 |
from .config import Config
|
| 32 |
from .logger import Logger
|
| 33 |
|
| 34 |
logger = Logger.get_logger(__name__)
|
| 35 |
|
|
|
|
| 36 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 37 |
+
# Arabic Reshaper — configured once at module level (thread-safe, reusable)
|
| 38 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 39 |
#
|
| 40 |
+
# WHY reshaper is required:
|
| 41 |
+
# Pillow's basic layout engine (FreeType without libraqm) renders each Unicode codepoint as its ISOLATED form.
|
| 42 |
+
# arabic_reshaper converts codepoints to contextual presentation forms
|
| 43 |
+
# (initial / medial / final / isolated) and joins ligatures.
|
| 44 |
+
# Then bidi reorders for right-to-left display.
|
| 45 |
+
# Without reshaper → every letter is disconnected (the bug in the screenshot).
|
| 46 |
+
#
|
| 47 |
+
# Config:
|
| 48 |
+
# support_ligatures = True → joins لا → ﻻ and other common ligatures
|
| 49 |
+
# delete_harakat = False → preserves tashkeel so bidi positions them correctly
|
| 50 |
+
# delete_tatweel = True → removes kashida (ـ) for accurate pixel measurement
|
| 51 |
#
|
| 52 |
+
_ARABIC_RESHAPER = ArabicReshaper(configuration={
|
| 53 |
+
"support_ligatures": True,
|
| 54 |
+
"delete_harakat": False,
|
| 55 |
+
"delete_tatweel": True,
|
| 56 |
+
})
|
| 57 |
+
|
| 58 |
+
# Arabic script Unicode ranges
|
| 59 |
+
# Inclusive (first, last) code-point bounds used to decide whether a string
# contains Arabic-script characters.
_ARABIC_RANGES = [
    ("\u0600", "\u06FF"),  # Arabic
    ("\u0750", "\u077F"),  # Arabic Supplement
    ("\u08A0", "\u08FF"),  # Arabic Extended-A
    ("\uFB50", "\uFDFF"),  # Arabic Presentation Forms-A
    # Arabic Presentation Forms-B. The block nominally runs to U+FEFF, but
    # U+FEFF is the byte-order mark / zero-width no-break space, so the range
    # stops at U+FEFC (the last assigned Arabic form) to keep a stray BOM
    # from marking otherwise non-Arabic text as Arabic script.
    ("\uFE70", "\uFEFC"),
]
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 69 |
+
# Style Registry
|
| 70 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 71 |
STYLES = {
|
| 72 |
|
| 73 |
# ── 1. CLASSIC ────────────────────────────────────────────────────────────
|
|
|
|
| 124 |
},
|
| 125 |
|
| 126 |
# ── 4. TIKTOK NEON ────────────────────────────────────────────────────────
|
|
|
|
| 127 |
"tiktok_neon": {
|
| 128 |
"fontsize": 80,
|
| 129 |
"color": (255, 255, 255, 230),
|
|
|
|
| 176 |
(0, 9, 0, ( 0, 0, 0, 130)),
|
| 177 |
],
|
| 178 |
},
|
| 179 |
+
|
| 180 |
+
# ── 7. CAIRO BOLD (Arabic-optimised) ──────────────────────────────────────
|
| 181 |
+
# Cairo: contemporary Arabic sans-serif, clean lines, harmonious Latin+Arabic
|
| 182 |
+
# mix, named best Arabic display font by Granshan 2016.
|
| 183 |
+
# Best for: Egyptian/Gulf social media, TikTok Arabic content.
|
| 184 |
+
"cairo_bold": {
|
| 185 |
+
"fontsize": 80,
|
| 186 |
+
"color": (255, 255, 255, 255),
|
| 187 |
+
"stroke_color": (0, 0, 0, 220),
|
| 188 |
+
"stroke_width": 4,
|
| 189 |
+
"font": "Cairo-Bold.ttf",
|
| 190 |
+
"bg_color": None,
|
| 191 |
+
"position": ("center", 0.82),
|
| 192 |
+
"highlight_color": (10, 10, 10, 255),
|
| 193 |
+
"highlight_bg": (255, 210, 0, 255),
|
| 194 |
+
"highlight_bg_radius": 14,
|
| 195 |
+
"shadow_layers": [
|
| 196 |
+
(3, 5, 0, (0, 0, 0, 210)),
|
| 197 |
+
(6, 9, 0, (0, 0, 0, 80)),
|
| 198 |
+
],
|
| 199 |
+
},
|
| 200 |
+
|
| 201 |
+
# ── 8. TAJAWAL BOLD (Arabic-optimised) ────────────────────────────────────
|
| 202 |
+
# Tajawal: modern geometric Arabic sans-serif, optimised for small screens
|
| 203 |
+
# and video subtitles, excellent readability, covers Latin too.
|
| 204 |
+
# Best for: YouTube Arabic captions, mixed Arabic/English content.
|
| 205 |
+
"tajawal_bold": {
|
| 206 |
+
"fontsize": 82,
|
| 207 |
+
"color": (255, 255, 255, 255),
|
| 208 |
+
"stroke_color": (0, 0, 0, 230),
|
| 209 |
+
"stroke_width": 4,
|
| 210 |
+
"font": "Tajawal-Bold.ttf",
|
| 211 |
+
"bg_color": (0, 0, 0, 150),
|
| 212 |
+
"position": ("center", 0.80),
|
| 213 |
+
"highlight_color": (255, 255, 255, 255),
|
| 214 |
+
"highlight_bg": (220, 50, 50, 245),
|
| 215 |
+
"highlight_bg_radius": 18,
|
| 216 |
+
"shadow_layers": [
|
| 217 |
+
(0, 4, 12, (180, 0, 0, 140)),
|
| 218 |
+
],
|
| 219 |
+
},
|
| 220 |
+
|
| 221 |
+
# ── 9. NOTO ARABIC (Universal Arabic) ─────────────────────────────────────
|
| 222 |
+
# NotoSansArabic: Google's reference Arabic font, covers all Arabic script
|
| 223 |
+
# variants (Arabic, Persian/Farsi, Urdu, Kurdish), 1642 glyphs.
|
| 224 |
+
# Best for: multilingual content, Persian/Urdu subtitles.
|
| 225 |
+
"noto_arabic": {
|
| 226 |
+
"fontsize": 76,
|
| 227 |
+
"color": (240, 240, 240, 230),
|
| 228 |
+
"stroke_color": (0, 0, 0, 180),
|
| 229 |
+
"stroke_width": 3,
|
| 230 |
+
"font": "NotoSansArabic-Bold.ttf",
|
| 231 |
+
"bg_color": (0, 0, 0, 155),
|
| 232 |
+
"position": ("center", 0.78),
|
| 233 |
+
"highlight_color": (20, 20, 20, 255),
|
| 234 |
+
"highlight_bg": (255, 200, 40, 248),
|
| 235 |
+
"highlight_bg_radius": 16,
|
| 236 |
+
"shadow_layers": [
|
| 237 |
+
(0, 4, 10, (180, 130, 0, 150)),
|
| 238 |
+
],
|
| 239 |
+
},
|
| 240 |
}
|
| 241 |
|
| 242 |
|
|
|
|
| 262 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 263 |
|
| 264 |
def _rgba(c):
|
|
|
|
| 265 |
if c is None:
|
| 266 |
return None
|
| 267 |
if isinstance(c, (tuple, list)):
|
|
|
|
| 271 |
|
| 272 |
|
| 273 |
def _should_uppercase(text: str) -> bool:
    """Tell whether ``text`` may be upper-cased for display.

    Returns False as soon as any character of ``text`` lies inside one of
    the inclusive ``(first, last)`` bounds listed in ``_NO_UPPER_RANGES``;
    returns True otherwise (including for an empty string).
    """
    return not any(
        lo <= ch <= hi
        for lo, hi in _NO_UPPER_RANGES
        for ch in text
    )
|
| 278 |
|
| 279 |
|
| 280 |
+
def _is_arabic_script(text: str) -> bool:
    """Report whether ``text`` holds at least one Arabic-script character.

    A character counts when it falls inside any inclusive ``(first, last)``
    pair of ``_ARABIC_RANGES``. An empty string yields False.
    """
    return any(
        lo <= ch <= hi
        for lo, hi in _ARABIC_RANGES
        for ch in text
    )
|
| 286 |
+
|
| 287 |
+
|
| 288 |
def _prepare_display_text(raw: str, is_rtl: bool, language: str = None) -> str:
    """Convert ``raw`` into the string that is handed to PIL for drawing.

    Behaviour by case:

    * ``is_rtl`` is False: the text is upper-cased when
      ``_should_uppercase`` allows it, otherwise returned unchanged.
    * ``is_rtl`` is True and the text contains Arabic-script characters:
      it is reshaped with the module-level ``_ARABIC_RESHAPER`` and then
      reordered with ``get_display``. If either step raises, a warning is
      logged and the bidi-only path below is used instead.
    * ``is_rtl`` is True otherwise (e.g. Hebrew): only ``get_display`` is
      applied.

    If ``get_display`` itself raises, ``raw`` is returned untouched.

    ``language`` is accepted for interface compatibility and is not read
    by this function.
    """
    if is_rtl:
        if _is_arabic_script(raw):
            try:
                # Reshape to contextual forms first, then reorder for RTL.
                return get_display(_ARABIC_RESHAPER.reshape(raw))
            except Exception as exc:
                logger.warning(f"⚠️ Arabic reshape error for '{raw[:20]}…': {exc}")
                # Fall through to the bidi-only path: letters stay unshaped
                # but the text is at least ordered right-to-left.

        # Bidi-only path: non-Arabic RTL text, or the reshape fallback.
        try:
            return get_display(raw)
        except Exception:
            return raw

    # Left-to-right text: upper-case only for scripts where that applies.
    if _should_uppercase(raw):
        return raw.upper()
    return raw
|
| 327 |
|
| 328 |
|
| 329 |
def _is_rtl_text(language: str, text: str) -> bool:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
if language and Config.is_rtl(language):
|
| 331 |
return True
|
| 332 |
if text:
|
|
|
|
| 337 |
|
| 338 |
|
| 339 |
def _draw_shadow_layers(draw, box, layers, base_radius):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
x1, y1, x2, y2 = box
|
| 341 |
for (ox, oy, blur, color) in layers:
|
| 342 |
rgba = _rgba(color)
|
|
|
|
| 365 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 366 |
class SubtitleManager:
|
| 367 |
|
|
|
|
| 368 |
@staticmethod
|
| 369 |
def ensure_font(language: str = None, style_name: str = None,
|
| 370 |
style_font: str = None, text_content: str = None) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
detected_lang = None
|
| 372 |
|
| 373 |
if language:
|
| 374 |
lang_val = language.value if hasattr(language, 'value') else str(language)
|
| 375 |
detected_lang = None if lang_val == 'auto' else lang_val
|
| 376 |
|
|
|
|
| 377 |
if not detected_lang and text_content:
|
| 378 |
detected_lang = Config.detect_language_from_text(text_content)
|
| 379 |
|
|
|
|
| 380 |
if detected_lang:
|
| 381 |
font_name = Config.get_font_for_language(detected_lang, style_name)
|
| 382 |
elif style_font:
|
|
|
|
| 386 |
|
| 387 |
logger.debug(f"🔤 Font resolved: lang={detected_lang} style={style_name} → {font_name}")
|
| 388 |
|
|
|
|
| 389 |
font_path = os.path.join(Config.BASE_DIR, font_name)
|
| 390 |
|
| 391 |
if not os.path.exists(font_path):
|
|
|
|
| 403 |
except Exception as exc:
|
| 404 |
logger.error(f"❌ Font download failed for {font_name}: {exc}")
|
| 405 |
|
| 406 |
+
# Arabic-specific fallback chain
|
| 407 |
+
is_arabic_lang = detected_lang in ("ar", "fa", "ur", "ckb")
|
| 408 |
+
fallback_name = "NotoSansArabic-Bold.ttf" if is_arabic_lang else "NotoSans-Bold.ttf"
|
| 409 |
+
fallback_path = os.path.join(Config.BASE_DIR, fallback_name)
|
| 410 |
+
|
| 411 |
if not os.path.exists(fallback_path):
|
| 412 |
fallback_url = Config.FONTS.get(fallback_name)
|
| 413 |
if fallback_url:
|
|
|
|
| 417 |
pass
|
| 418 |
|
| 419 |
if os.path.exists(fallback_path):
|
| 420 |
+
logger.warning(f"⚠️ Using {fallback_name} fallback instead of {font_name}")
|
| 421 |
return fallback_path
|
| 422 |
|
|
|
|
| 423 |
logger.error("❌ All font downloads failed, falling back to system Arial")
|
| 424 |
return "Arial"
|
| 425 |
else:
|
|
|
|
| 427 |
|
| 428 |
return font_path
|
| 429 |
|
|
|
|
| 430 |
@staticmethod
|
| 431 |
def wrap_text(text: str, font, max_width: int) -> list:
|
|
|
|
| 432 |
lines = []
|
| 433 |
words = text.split()
|
| 434 |
if not words:
|
|
|
|
| 444 |
|
| 445 |
if width > max_width:
|
| 446 |
if len(current_line) == 1:
|
|
|
|
| 447 |
lines.append(current_line.pop())
|
| 448 |
else:
|
| 449 |
last = current_line.pop()
|
|
|
|
| 455 |
|
| 456 |
return lines
|
| 457 |
|
|
|
|
| 458 |
@staticmethod
|
| 459 |
def create_pil_text_clip(text: str, fontsize: int, color, font_path: str,
|
| 460 |
stroke_color=(0, 0, 0, 200), stroke_width: int = 2,
|
| 461 |
bg_color=None, padding: int = 12, bg_radius: int = 18,
|
| 462 |
max_width: int = None):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 463 |
try:
|
| 464 |
try:
|
| 465 |
font = ImageFont.truetype(font_path, fontsize)
|
|
|
|
| 470 |
dummy = Image.new("RGBA", (1, 1))
|
| 471 |
d = ImageDraw.Draw(dummy)
|
| 472 |
|
|
|
|
| 473 |
lines = [text]
|
| 474 |
if max_width:
|
| 475 |
avail = max_width - padding * 4
|
| 476 |
lines = SubtitleManager.wrap_text(text, font, avail)
|
| 477 |
|
|
|
|
| 478 |
line_metrics = []
|
| 479 |
max_w = 0
|
| 480 |
total_h = 0
|
|
|
|
| 505 |
|
| 506 |
current_y = margin
|
| 507 |
for m in line_metrics:
|
|
|
|
| 508 |
lx = (iw - m["w"]) / 2 - m["bbox"][0]
|
| 509 |
ly = current_y - m["bbox"][1]
|
| 510 |
draw.text(
|
|
|
|
| 521 |
logger.error(f"⚠️ create_pil_text_clip error: {exc}")
|
| 522 |
return None
|
| 523 |
|
|
|
|
| 524 |
@staticmethod
|
| 525 |
def create_sentence_highlight_clip(
|
| 526 |
sentence_words: list,
|
| 527 |
+
active_word_index: int,
|
| 528 |
font,
|
| 529 |
fontsize: int,
|
| 530 |
font_path: str,
|
| 531 |
style_config: dict,
|
| 532 |
+
is_rtl: bool,
|
| 533 |
+
language: str = None,
|
| 534 |
padding: int = 14,
|
| 535 |
bg_radius: int = 20,
|
| 536 |
max_width: int = None,
|
| 537 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 538 |
try:
|
| 539 |
dummy = Image.new("RGBA", (1, 1))
|
| 540 |
d = ImageDraw.Draw(dummy)
|
| 541 |
space_w = d.textbbox((0, 0), " ", font=font)[2]
|
| 542 |
|
|
|
|
| 543 |
words_data = []
|
| 544 |
+
ordered = list(reversed(sentence_words)) if is_rtl else sentence_words
|
|
|
|
| 545 |
|
| 546 |
for idx, w in enumerate(ordered):
|
| 547 |
raw = w.get("text", "")
|
| 548 |
display = _prepare_display_text(raw, is_rtl, language)
|
| 549 |
bbox = d.textbbox((0, 0), display, font=font)
|
| 550 |
words_data.append({
|
| 551 |
+
"index": idx,
|
| 552 |
"text": display,
|
| 553 |
"w": bbox[2] - bbox[0],
|
| 554 |
"h": bbox[3] - bbox[1],
|
| 555 |
"bbox": bbox,
|
| 556 |
})
|
| 557 |
|
|
|
|
| 558 |
n = len(sentence_words)
|
| 559 |
effective_active_index = (
|
| 560 |
(n - 1 - active_word_index)
|
|
|
|
| 562 |
else active_word_index
|
| 563 |
)
|
| 564 |
|
|
|
|
| 565 |
lines = []
|
| 566 |
current_line = []
|
| 567 |
current_w = 0
|
|
|
|
| 582 |
if current_line:
|
| 583 |
lines.append(current_line)
|
| 584 |
|
|
|
|
| 585 |
line_spacing = int(fontsize * 0.2)
|
| 586 |
stroke_w = style_config.get("stroke_width", 2)
|
| 587 |
margin = int(stroke_w * 2) + padding
|
|
|
|
| 605 |
img = Image.new("RGBA", (int(iw), int(ih)), (0, 0, 0, 0))
|
| 606 |
draw = ImageDraw.Draw(img)
|
| 607 |
|
|
|
|
| 608 |
hl_bg = style_config.get("highlight_bg")
|
| 609 |
hl_radius = style_config.get("highlight_bg_radius", bg_radius)
|
| 610 |
shadows = style_config.get("shadow_layers", [])
|
| 611 |
|
| 612 |
for i, line in enumerate(lines):
|
|
|
|
| 613 |
lx = margin + (canvas_w - line_infos[i]["w"]) // 2
|
| 614 |
ly = margin + bleed // 2 + line_infos[i]["y"]
|
| 615 |
cx = lx
|
|
|
|
| 622 |
by2 = ly + wd["h"] + padding // 2
|
| 623 |
box = (bx1, by1, bx2, by2)
|
| 624 |
|
|
|
|
| 625 |
if shadows:
|
| 626 |
_draw_shadow_layers(draw, box, shadows, hl_radius)
|
| 627 |
|
|
|
|
| 628 |
draw.rounded_rectangle(
|
| 629 |
[(bx1, by1), (bx2, by2)],
|
| 630 |
radius=hl_radius,
|
|
|
|
| 633 |
|
| 634 |
cx += wd["w"] + space_w
|
| 635 |
|
|
|
|
| 636 |
rest_c = _rgba(style_config.get("color", (255, 255, 255, 255)))
|
| 637 |
hl_c = _rgba(style_config.get("highlight_color", rest_c))
|
| 638 |
stk_c = _rgba(style_config.get("stroke_color", (0, 0, 0, 255)))
|
|
|
|
| 658 |
logger.error(f"⚠️ create_sentence_highlight_clip error: {exc}")
|
| 659 |
return None
|
| 660 |
|
|
|
|
| 661 |
@staticmethod
def get_style_config(style_name: str) -> dict:
    """Return the caption style preset registered under ``style_name``.

    Falls back to the "classic" preset when ``style_name`` is not a key of
    the module-level ``STYLES`` mapping.
    """
    # Resolve the fallback first, exactly as the eager default argument did.
    classic_preset = STYLES["classic"]
    return STYLES.get(style_name, classic_preset)
|
| 664 |
|
|
|
|
| 665 |
@staticmethod
|
| 666 |
def create_caption_clips(
|
| 667 |
transcript_data,
|
|
|
|
| 673 |
"""
|
| 674 |
Generates all caption ImageClips ready for compositing.
|
| 675 |
|
| 676 |
+
Arabic caption_style recommendations:
|
| 677 |
+
"cairo_bold" → best for Egyptian/Gulf social media content
|
| 678 |
+
"tajawal_bold" → modern geometric, dark background, great readability
|
| 679 |
+
"noto_arabic" → universal, covers Arabic/Persian/Urdu/Kurdish
|
| 680 |
"""
|
| 681 |
all_clips = []
|
| 682 |
style_cfg = SubtitleManager.get_style_config(caption_style)
|
| 683 |
|
|
|
|
| 684 |
segments = []
|
| 685 |
sample_text = ""
|
| 686 |
|
|
|
|
| 697 |
sample_text = s["text"]
|
| 698 |
break
|
| 699 |
|
|
|
|
| 700 |
font_path = SubtitleManager.ensure_font(
|
| 701 |
language = language,
|
| 702 |
style_name = caption_style,
|
|
|
|
| 711 |
# MODE: highlight_word
|
| 712 |
# ══════════════════════════════════════════════════════════════════════
|
| 713 |
if caption_mode == "highlight_word":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 714 |
fontsize = style_cfg.get("fontsize", 75)
|
| 715 |
try:
|
| 716 |
font = ImageFont.truetype(font_path, fontsize)
|
|
|
|
| 718 |
logger.warning("⚠️ TrueType load failed — using default font.")
|
| 719 |
font = ImageFont.load_default()
|
| 720 |
|
| 721 |
+
for seg in segments:
|
| 722 |
+
sw = seg.get("words", [])
|
| 723 |
+
if not sw:
|
| 724 |
+
logger.warning(
|
| 725 |
+
f"⚠️ Segment [{seg.get('start', 0):.2f}s] has no word timestamps, skipping."
|
| 726 |
+
)
|
| 727 |
continue
|
| 728 |
+
|
| 729 |
+
sent_start = seg.get("start", sw[0]["start"])
|
| 730 |
+
sent_end = seg.get("end", sw[-1]["end"])
|
| 731 |
+
sent_text = seg.get("text", " ".join(w["text"] for w in sw))
|
| 732 |
+
is_rtl = _is_rtl_text(language, sent_text)
|
| 733 |
+
|
| 734 |
+
for active_idx, active_word in enumerate(sw):
|
| 735 |
+
w_start = active_word.get("start", sent_start)
|
| 736 |
+
w_end = active_word.get("end", sent_end)
|
| 737 |
+
if w_end <= w_start:
|
| 738 |
+
w_end = w_start + 0.05
|
| 739 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 740 |
clip = SubtitleManager.create_sentence_highlight_clip(
|
| 741 |
sentence_words = sw,
|
| 742 |
+
active_word_index = active_idx,
|
| 743 |
font = font,
|
| 744 |
fontsize = fontsize,
|
| 745 |
font_path = font_path,
|
| 746 |
style_config = style_cfg,
|
| 747 |
+
is_rtl = is_rtl,
|
| 748 |
language = language,
|
| 749 |
padding = style_cfg.get("padding", 14),
|
| 750 |
bg_radius = style_cfg.get("highlight_bg_radius", 20),
|
|
|
|
| 752 |
)
|
| 753 |
if clip:
|
| 754 |
all_clips.append(
|
| 755 |
+
clip.set_start(w_start)
|
| 756 |
+
.set_end(w_end)
|
| 757 |
.set_position(pos)
|
| 758 |
)
|
| 759 |
|
|
|
|
| 760 |
covered = [(w["start"], w["end"]) for w in sw]
|
| 761 |
gaps = []
|
| 762 |
+
|
| 763 |
if sent_start < covered[0][0]:
|
| 764 |
gaps.append((sent_start, covered[0][0]))
|
| 765 |
for j in range(len(covered) - 1):
|
|
|
|
| 768 |
if covered[-1][1] < sent_end:
|
| 769 |
gaps.append((covered[-1][1], sent_end))
|
| 770 |
|
| 771 |
+
plain_cfg = {**style_cfg, "highlight_bg": None, "shadow_layers": []}
|
| 772 |
for gs, ge in gaps:
|
| 773 |
+
if ge - gs < 0.02:
|
| 774 |
+
continue
|
| 775 |
gc = SubtitleManager.create_sentence_highlight_clip(
|
| 776 |
sentence_words = sw,
|
| 777 |
+
active_word_index = -1,
|
| 778 |
font = font,
|
| 779 |
fontsize = fontsize,
|
| 780 |
font_path = font_path,
|
|
|
|
| 807 |
else:
|
| 808 |
continue
|
| 809 |
|
|
|
|
| 810 |
line1 = seg.get("_line1", "")
|
| 811 |
line2 = seg.get("_line2", "")
|
| 812 |
|
| 813 |
if line1:
|
|
|
|
| 814 |
display_text = f"{line1}\n{line2}".strip() if line2 else line1
|
| 815 |
chunks = [{"text": display_text, "start": start_t, "end": end_t}]
|
| 816 |
else:
|
|
|
|
| 817 |
chunk_size = 1 if caption_mode == "word" else 4
|
| 818 |
chunks = []
|
| 819 |
stt_words = seg.get("words")
|
|
|
|
| 841 |
|
| 842 |
for chunk in chunks:
|
| 843 |
disp = chunk["text"]
|
| 844 |
+
is_rtl = _is_rtl_text(language, disp)
|
| 845 |
disp = _prepare_display_text(disp, is_rtl, language)
|
| 846 |
|
| 847 |
clip = SubtitleManager.create_pil_text_clip(
|
|
|
|
| 864 |
|
| 865 |
return all_clips
|
| 866 |
|
|
|
|
| 867 |
@staticmethod
|
| 868 |
def create_captions(
|
| 869 |
video_clip,
|
|
|
|
| 873 |
caption_mode: str = "sentence",
|
| 874 |
caption_style: str = "classic",
|
| 875 |
):
|
|
|
|
| 876 |
clips = SubtitleManager.create_caption_clips(
|
| 877 |
transcript_data,
|
| 878 |
size = size,
|
processor.py
CHANGED
|
@@ -9,6 +9,10 @@ Fixes applied:
|
|
| 9 |
- style string normalised once
|
| 10 |
- get_best_segments wired into process_video
|
| 11 |
- detected_lang used correctly for captions
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
"""
|
| 13 |
import os
|
| 14 |
import gc
|
|
@@ -20,14 +24,58 @@ import json_repair
|
|
| 20 |
import core # Applies monkey patches
|
| 21 |
from core.config import Config
|
| 22 |
from core.logger import Logger
|
| 23 |
-
from core.stt import STT
|
| 24 |
-
from core.analyze import
|
| 25 |
from core.styles import StyleFactory
|
| 26 |
from core.subtitle_manager import SubtitleManager
|
| 27 |
from core.free_translator import FreeTranslator
|
| 28 |
|
| 29 |
logger = Logger.get_logger(__name__)
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 33 |
class VideoProcessor:
|
|
@@ -60,7 +108,6 @@ class VideoProcessor:
|
|
| 60 |
except Exception as e:
|
| 61 |
logger.warning(f"⚠️ json_repair failed, using raw content: {e}")
|
| 62 |
|
| 63 |
-
# Last-resort brace balancing
|
| 64 |
open_b = content.count("{")
|
| 65 |
close_b = content.count("}")
|
| 66 |
if open_b > close_b:
|
|
@@ -89,7 +136,6 @@ class VideoProcessor:
|
|
| 89 |
for key in ("segments", "clips", "moments"):
|
| 90 |
if key in segments_data and isinstance(segments_data[key], list):
|
| 91 |
return segments_data[key]
|
| 92 |
-
# Fallback: first list value found
|
| 93 |
for v in segments_data.values():
|
| 94 |
if isinstance(v, list):
|
| 95 |
return v
|
|
@@ -104,18 +150,17 @@ class VideoProcessor:
|
|
| 104 |
|
| 105 |
def analyze_impact(self,
|
| 106 |
video_path,
|
| 107 |
-
source_language=None,
|
| 108 |
-
target_language=None,
|
| 109 |
timestamp_mode="segments",
|
| 110 |
progress_callback=None):
|
| 111 |
"""
|
| 112 |
STT + AI viral-moment detection.
|
| 113 |
|
| 114 |
-
source_language :
|
| 115 |
-
|
| 116 |
-
target_language :
|
| 117 |
-
|
| 118 |
-
Returns (unique_segments, duration, data)
|
| 119 |
"""
|
| 120 |
if progress_callback:
|
| 121 |
progress_callback(5, "Starting speech-to-text...")
|
|
@@ -126,7 +171,7 @@ class VideoProcessor:
|
|
| 126 |
|
| 127 |
full_segments, full_text, duration, detected_lang = self.stt.get_transcript(
|
| 128 |
video_path,
|
| 129 |
-
language=source_language,
|
| 130 |
skip_ai=True,
|
| 131 |
timestamp_mode=timestamp_mode,
|
| 132 |
)
|
|
@@ -135,8 +180,8 @@ class VideoProcessor:
|
|
| 135 |
|
| 136 |
data = {
|
| 137 |
"segments": full_segments,
|
| 138 |
-
"detected_language": detected_lang,
|
| 139 |
-
"target_language": target_language,
|
| 140 |
"duration": duration,
|
| 141 |
}
|
| 142 |
|
|
@@ -174,7 +219,7 @@ class VideoProcessor:
|
|
| 174 |
f"{min(current_end, max_time)/60:.1f}m …"
|
| 175 |
)
|
| 176 |
|
| 177 |
-
ai_res =
|
| 178 |
logger.info(f"🤖 AI response type: {type(ai_res)}")
|
| 179 |
|
| 180 |
try:
|
|
@@ -189,7 +234,6 @@ class VideoProcessor:
|
|
| 189 |
if current_end >= max_time:
|
| 190 |
break
|
| 191 |
|
| 192 |
-
# Deduplicate by start_time
|
| 193 |
seen, unique = set(), []
|
| 194 |
for s in all_ai_segs:
|
| 195 |
st = s.get("start_time")
|
|
@@ -218,14 +262,19 @@ class VideoProcessor:
|
|
| 218 |
"""
|
| 219 |
Cuts, styles, captions, and exports each viral clip.
|
| 220 |
|
| 221 |
-
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
"""
|
| 224 |
logger.info("🎨 Phase 3: Style & Captions …")
|
| 225 |
if progress_callback:
|
| 226 |
progress_callback(60, "Generating clips …")
|
| 227 |
|
| 228 |
-
# ── Video duration ────────────────────────────────────────────────────
|
| 229 |
video_duration = data.get("duration") or 0
|
| 230 |
if not video_duration:
|
| 231 |
try:
|
|
@@ -235,17 +284,9 @@ class VideoProcessor:
|
|
| 235 |
logger.error(f"❌ Could not determine video duration: {e}")
|
| 236 |
|
| 237 |
# ── Language resolution ───────────────────────────────────────────────
|
| 238 |
-
#
|
| 239 |
-
# detected_lang = اللغة الفعلية للفيديو (من Whisper)
|
| 240 |
-
# target_language = اللغة المطلوبة للـ output (من الريكويست)
|
| 241 |
-
#
|
| 242 |
-
# needs_translation = True → نترجم النص
|
| 243 |
-
# caption_lang = اللغة اللي هيتعمل بيها الكابشن
|
| 244 |
-
#
|
| 245 |
detected_lang = data.get("detected_language", "en")
|
| 246 |
-
target_language = data.get("target_language")
|
| 247 |
|
| 248 |
-
# normalize
|
| 249 |
if hasattr(target_language, "value"):
|
| 250 |
target_language = target_language.value
|
| 251 |
|
|
@@ -255,7 +296,6 @@ class VideoProcessor:
|
|
| 255 |
and target_language != detected_lang
|
| 256 |
)
|
| 257 |
|
| 258 |
-
# الكابشن بيتعمل بلغة الـ output لو فيه ترجمة، وإلا بلغة الفيديو الأصلي
|
| 259 |
caption_lang = target_language if needs_translation else detected_lang
|
| 260 |
|
| 261 |
translator = FreeTranslator() if needs_translation else None
|
|
@@ -312,46 +352,53 @@ class VideoProcessor:
|
|
| 312 |
final_output = os.path.join(Config.OUTPUTS_DIR, "viral_clips", out_name)
|
| 313 |
os.makedirs(os.path.dirname(final_output), exist_ok=True)
|
| 314 |
|
| 315 |
-
# ── Cut clip
|
| 316 |
current_video_clip = mpe.VideoFileClip(input_video_path)
|
| 317 |
clip = current_video_clip.subclip(start, end)
|
| 318 |
|
| 319 |
# ── Build segment_transcript ──────────────────────────────────
|
| 320 |
-
# الترجمة بتحصل هنا فقط — مفيش أي مكان تاني بيعدّل على data
|
| 321 |
segment_transcript = {"segments": []}
|
| 322 |
|
| 323 |
for s in data["segments"]:
|
| 324 |
if s["start"] >= end or s["end"] <= start:
|
| 325 |
continue
|
| 326 |
|
| 327 |
-
new_seg
|
| 328 |
new_seg["start"] = max(0, s["start"] - start)
|
| 329 |
new_seg["end"] = min(end - start, s["end"] - start)
|
| 330 |
|
| 331 |
if needs_translation and translator:
|
| 332 |
-
#
|
| 333 |
try:
|
| 334 |
translated_text, _ = translator.translate_text(
|
| 335 |
-
s["text"], target_language
|
| 336 |
)
|
| 337 |
except Exception as te:
|
| 338 |
logger.warning(f"⚠️ Translation error: {te}")
|
| 339 |
translated_text = s["text"]
|
| 340 |
|
| 341 |
new_seg["text"] = translated_text
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
else:
|
| 354 |
-
#
|
| 355 |
if "words" in s:
|
| 356 |
new_seg["words"] = [
|
| 357 |
{
|
|
@@ -362,6 +409,8 @@ class VideoProcessor:
|
|
| 362 |
for w in s["words"]
|
| 363 |
if w["start"] < end and w["end"] > start
|
| 364 |
]
|
|
|
|
|
|
|
| 365 |
|
| 366 |
segment_transcript["segments"].append(new_seg)
|
| 367 |
|
|
@@ -418,20 +467,20 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
|
|
| 418 |
"""
|
| 419 |
End-to-end pipeline: STT → AI analysis → clip export.
|
| 420 |
|
| 421 |
-
|
| 422 |
-
source_language :
|
| 423 |
-
|
| 424 |
-
language :
|
| 425 |
-
|
| 426 |
caption_mode : sentence | word | highlight_word
|
| 427 |
-
caption_style : classic | modern_glow | tiktok_bold |
|
| 428 |
"""
|
| 429 |
try:
|
| 430 |
processor = VideoProcessor(model_size=model_size)
|
| 431 |
|
| 432 |
caption_mode = kwargs.get("caption_mode", "sentence")
|
| 433 |
|
| 434 |
-
# highlight_word
|
| 435 |
timestamp_mode = (
|
| 436 |
"words"
|
| 437 |
if caption_mode in ("word", "highlight_word")
|
|
@@ -441,8 +490,8 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
|
|
| 441 |
# Phase 1 + 2: STT + AI analysis
|
| 442 |
viral_segments, duration, stt_data = processor.analyze_impact(
|
| 443 |
video_path,
|
| 444 |
-
source_language = kwargs.get("source_language"),
|
| 445 |
-
target_language = kwargs.get("language"),
|
| 446 |
timestamp_mode = timestamp_mode,
|
| 447 |
)
|
| 448 |
|
|
@@ -450,7 +499,6 @@ def process_video(video_path, style="cinematic_blur", model_size="base", **kwarg
|
|
| 450 |
logger.warning("⚠️ No viral segments found.")
|
| 451 |
return []
|
| 452 |
|
| 453 |
-
# Sort by viral score
|
| 454 |
best_clips = processor.get_best_segments(viral_segments, duration)
|
| 455 |
|
| 456 |
# Phase 3: render
|
|
|
|
| 9 |
- style string normalised once
|
| 10 |
- get_best_segments wired into process_video
|
| 11 |
- detected_lang used correctly for captions
|
| 12 |
+
- ✅ FIX: after translation, _line1/_line2 re-computed from translated text
|
| 13 |
+
using SubtitleSegmenter._split_into_lines so line splits match translated content
|
| 14 |
+
- ✅ FIX: translated word timestamps distributed proportional to word length
|
| 15 |
+
(instead of uniform distribution) for better highlight sync
|
| 16 |
"""
|
| 17 |
import os
|
| 18 |
import gc
|
|
|
|
| 24 |
import core # Applies monkey patches
|
| 25 |
from core.config import Config
|
| 26 |
from core.logger import Logger
|
| 27 |
+
from core.stt import STT, SubtitleSegmenter
|
| 28 |
+
from core.analyze import analyze_transcript
|
| 29 |
from core.styles import StyleFactory
|
| 30 |
from core.subtitle_manager import SubtitleManager
|
| 31 |
from core.free_translator import FreeTranslator
|
| 32 |
|
| 33 |
logger = Logger.get_logger(__name__)
|
| 34 |
|
| 35 |
+
# Max chars per line — must match SubtitleSegmenter constant
|
| 36 |
+
_MAX_CHARS_PER_LINE = 42
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _distribute_timestamps_by_length(words: list, seg_start: float, seg_end: float) -> list:
|
| 40 |
+
"""
|
| 41 |
+
✅ FIX: Distribute word timestamps proportional to character length instead of
|
| 42 |
+
uniform distribution. Longer words get more time, giving better sync in
|
| 43 |
+
highlight_word mode after translation.
|
| 44 |
+
|
| 45 |
+
words: list of str (translated words)
|
| 46 |
+
Returns: list of { text, start, end }
|
| 47 |
+
"""
|
| 48 |
+
if not words:
|
| 49 |
+
return []
|
| 50 |
+
|
| 51 |
+
total_chars = sum(len(w) for w in words)
|
| 52 |
+
seg_dur = seg_end - seg_start
|
| 53 |
+
|
| 54 |
+
result = []
|
| 55 |
+
cursor = seg_start
|
| 56 |
+
|
| 57 |
+
for i, w in enumerate(words):
|
| 58 |
+
if total_chars > 0:
|
| 59 |
+
fraction = len(w) / total_chars
|
| 60 |
+
else:
|
| 61 |
+
fraction = 1.0 / len(words)
|
| 62 |
+
|
| 63 |
+
w_dur = seg_dur * fraction
|
| 64 |
+
w_end = cursor + w_dur
|
| 65 |
+
|
| 66 |
+
# Clamp last word to seg_end to avoid float drift
|
| 67 |
+
if i == len(words) - 1:
|
| 68 |
+
w_end = seg_end
|
| 69 |
+
|
| 70 |
+
result.append({
|
| 71 |
+
"text": w,
|
| 72 |
+
"start": round(cursor, 3),
|
| 73 |
+
"end": round(w_end, 3),
|
| 74 |
+
})
|
| 75 |
+
cursor = w_end
|
| 76 |
+
|
| 77 |
+
return result
|
| 78 |
+
|
| 79 |
|
| 80 |
# ─────────────────────────────────────────────────────────────────────────────
|
| 81 |
class VideoProcessor:
|
|
|
|
| 108 |
except Exception as e:
|
| 109 |
logger.warning(f"⚠️ json_repair failed, using raw content: {e}")
|
| 110 |
|
|
|
|
| 111 |
open_b = content.count("{")
|
| 112 |
close_b = content.count("}")
|
| 113 |
if open_b > close_b:
|
|
|
|
| 136 |
for key in ("segments", "clips", "moments"):
|
| 137 |
if key in segments_data and isinstance(segments_data[key], list):
|
| 138 |
return segments_data[key]
|
|
|
|
| 139 |
for v in segments_data.values():
|
| 140 |
if isinstance(v, list):
|
| 141 |
return v
|
|
|
|
| 150 |
|
| 151 |
def analyze_impact(self,
|
| 152 |
video_path,
|
| 153 |
+
source_language=None,
|
| 154 |
+
target_language=None,
|
| 155 |
timestamp_mode="segments",
|
| 156 |
progress_callback=None):
|
| 157 |
"""
|
| 158 |
STT + AI viral-moment detection.
|
| 159 |
|
| 160 |
+
source_language : passed directly to Whisper.
|
| 161 |
+
None → Whisper auto-detects (slower but safe).
|
| 162 |
+
target_language : stored in data for process_clips to use for
|
| 163 |
+
translation and caption rendering.
|
|
|
|
| 164 |
"""
|
| 165 |
if progress_callback:
|
| 166 |
progress_callback(5, "Starting speech-to-text...")
|
|
|
|
| 171 |
|
| 172 |
full_segments, full_text, duration, detected_lang = self.stt.get_transcript(
|
| 173 |
video_path,
|
| 174 |
+
language=source_language,
|
| 175 |
skip_ai=True,
|
| 176 |
timestamp_mode=timestamp_mode,
|
| 177 |
)
|
|
|
|
| 180 |
|
| 181 |
data = {
|
| 182 |
"segments": full_segments,
|
| 183 |
+
"detected_language": detected_lang,
|
| 184 |
+
"target_language": target_language,
|
| 185 |
"duration": duration,
|
| 186 |
}
|
| 187 |
|
|
|
|
| 219 |
f"{min(current_end, max_time)/60:.1f}m …"
|
| 220 |
)
|
| 221 |
|
| 222 |
+
ai_res = analyze_transcript(chunk_transcript)
|
| 223 |
logger.info(f"🤖 AI response type: {type(ai_res)}")
|
| 224 |
|
| 225 |
try:
|
|
|
|
| 234 |
if current_end >= max_time:
|
| 235 |
break
|
| 236 |
|
|
|
|
| 237 |
seen, unique = set(), []
|
| 238 |
for s in all_ai_segs:
|
| 239 |
st = s.get("start_time")
|
|
|
|
| 262 |
"""
|
| 263 |
Cuts, styles, captions, and exports each viral clip.
|
| 264 |
|
| 265 |
+
✅ FIX 1: After translation, _line1 and _line2 are re-computed from
|
| 266 |
+
the translated text using SubtitleSegmenter._split_into_lines.
|
| 267 |
+
Previously they were left as the original-language splits which
|
| 268 |
+
caused wrong line breaks in the translated captions.
|
| 269 |
+
|
| 270 |
+
✅ FIX 2: Word timestamps after translation are distributed proportional
|
| 271 |
+
to character length (via _distribute_timestamps_by_length) instead of
|
| 272 |
+
uniform distribution, giving better sync in highlight_word mode.
|
| 273 |
"""
|
| 274 |
logger.info("🎨 Phase 3: Style & Captions …")
|
| 275 |
if progress_callback:
|
| 276 |
progress_callback(60, "Generating clips …")
|
| 277 |
|
|
|
|
| 278 |
video_duration = data.get("duration") or 0
|
| 279 |
if not video_duration:
|
| 280 |
try:
|
|
|
|
| 284 |
logger.error(f"❌ Could not determine video duration: {e}")
|
| 285 |
|
| 286 |
# ── Language resolution ───────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
detected_lang = data.get("detected_language", "en")
|
| 288 |
+
target_language = data.get("target_language")
|
| 289 |
|
|
|
|
| 290 |
if hasattr(target_language, "value"):
|
| 291 |
target_language = target_language.value
|
| 292 |
|
|
|
|
| 296 |
and target_language != detected_lang
|
| 297 |
)
|
| 298 |
|
|
|
|
| 299 |
caption_lang = target_language if needs_translation else detected_lang
|
| 300 |
|
| 301 |
translator = FreeTranslator() if needs_translation else None
|
|
|
|
| 352 |
final_output = os.path.join(Config.OUTPUTS_DIR, "viral_clips", out_name)
|
| 353 |
os.makedirs(os.path.dirname(final_output), exist_ok=True)
|
| 354 |
|
| 355 |
+
# ── Cut clip ──────────────────────────────────────────────────
|
| 356 |
current_video_clip = mpe.VideoFileClip(input_video_path)
|
| 357 |
clip = current_video_clip.subclip(start, end)
|
| 358 |
|
| 359 |
# ── Build segment_transcript ──────────────────────────────────
|
|
|
|
| 360 |
segment_transcript = {"segments": []}
|
| 361 |
|
| 362 |
for s in data["segments"]:
|
| 363 |
if s["start"] >= end or s["end"] <= start:
|
| 364 |
continue
|
| 365 |
|
| 366 |
+
new_seg = s.copy()
|
| 367 |
new_seg["start"] = max(0, s["start"] - start)
|
| 368 |
new_seg["end"] = min(end - start, s["end"] - start)
|
| 369 |
|
| 370 |
if needs_translation and translator:
|
| 371 |
+
# ── Translate text ────────────────────────────────────
|
| 372 |
try:
|
| 373 |
translated_text, _ = translator.translate_text(
|
| 374 |
+
s["text"], target_language, detected_lang
|
| 375 |
)
|
| 376 |
except Exception as te:
|
| 377 |
logger.warning(f"⚠️ Translation error: {te}")
|
| 378 |
translated_text = s["text"]
|
| 379 |
|
| 380 |
new_seg["text"] = translated_text
|
| 381 |
+
|
| 382 |
+
# ✅ FIX 1: Re-compute line splits from TRANSLATED text.
|
| 383 |
+
# Original _line1/_line2 are in the source language and
|
| 384 |
+
# will have wrong split points after translation.
|
| 385 |
+
translated_lines = SubtitleSegmenter._split_into_lines(
|
| 386 |
+
translated_text, _MAX_CHARS_PER_LINE
|
| 387 |
+
)
|
| 388 |
+
new_seg["_line1"] = translated_lines[0] if len(translated_lines) > 0 else translated_text
|
| 389 |
+
new_seg["_line2"] = translated_lines[1] if len(translated_lines) > 1 else ""
|
| 390 |
+
|
| 391 |
+
# ✅ FIX 2: Distribute word timestamps proportional to
|
| 392 |
+
# character length for better highlight_word sync.
|
| 393 |
+
translated_words = translated_text.split()
|
| 394 |
+
new_seg["words"] = _distribute_timestamps_by_length(
|
| 395 |
+
translated_words,
|
| 396 |
+
new_seg["start"],
|
| 397 |
+
new_seg["end"],
|
| 398 |
+
)
|
| 399 |
+
|
| 400 |
else:
|
| 401 |
+
# No translation — adjust existing word timestamps
|
| 402 |
if "words" in s:
|
| 403 |
new_seg["words"] = [
|
| 404 |
{
|
|
|
|
| 409 |
for w in s["words"]
|
| 410 |
if w["start"] < end and w["end"] > start
|
| 411 |
]
|
| 412 |
+
# _line1/_line2 already correct from SubtitleSegmenter
|
| 413 |
+
# (already in source lang which IS caption lang here)
|
| 414 |
|
| 415 |
segment_transcript["segments"].append(new_seg)
|
| 416 |
|
|
|
|
| 467 |
"""
|
| 468 |
End-to-end pipeline: STT → AI analysis → clip export.
|
| 469 |
|
| 470 |
+
Important kwargs:
|
| 471 |
+
source_language : language of the original video → passed to Whisper.
|
| 472 |
+
If not set → Whisper auto-detects.
|
| 473 |
+
language : desired output language (translation + captions).
|
| 474 |
+
If same as source → no translation.
|
| 475 |
caption_mode : sentence | word | highlight_word
|
| 476 |
+
caption_style : classic | modern_glow | tiktok_bold | …
|
| 477 |
"""
|
| 478 |
try:
|
| 479 |
processor = VideoProcessor(model_size=model_size)
|
| 480 |
|
| 481 |
caption_mode = kwargs.get("caption_mode", "sentence")
|
| 482 |
|
| 483 |
+
# highlight_word and word modes both need word-level timestamps
|
| 484 |
timestamp_mode = (
|
| 485 |
"words"
|
| 486 |
if caption_mode in ("word", "highlight_word")
|
|
|
|
| 490 |
# Phase 1 + 2: STT + AI analysis
|
| 491 |
viral_segments, duration, stt_data = processor.analyze_impact(
|
| 492 |
video_path,
|
| 493 |
+
source_language = kwargs.get("source_language"),
|
| 494 |
+
target_language = kwargs.get("language"),
|
| 495 |
timestamp_mode = timestamp_mode,
|
| 496 |
)
|
| 497 |
|
|
|
|
| 499 |
logger.warning("⚠️ No viral segments found.")
|
| 500 |
return []
|
| 501 |
|
|
|
|
| 502 |
best_clips = processor.get_best_segments(viral_segments, duration)
|
| 503 |
|
| 504 |
# Phase 3: render
|
requirements.txt
CHANGED
|
@@ -15,5 +15,5 @@ imageio-ffmpeg==0.4.8
|
|
| 15 |
openai>=1.0.0
|
| 16 |
scipy
|
| 17 |
json_repair
|
| 18 |
-
|
| 19 |
-
|
|
|
|
| 15 |
openai>=1.0.0
|
| 16 |
scipy
|
| 17 |
json_repair
|
| 18 |
+
tiktoken
|
| 19 |
+
pydantic
|