Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -20,32 +20,64 @@ FORMATTING RULES (STRICT):
|
|
| 20 |
"""
|
| 21 |
|
| 22 |
def parse_script(full_text):
|
| 23 |
-
#
|
| 24 |
-
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
clean = clean.replace("**", "").replace("*", "").replace("__", "")
|
| 32 |
-
# Remove common metadata words/colons at start of lines (e.g., "Hook:", "Intro:")
|
| 33 |
-
clean = re.sub(r'^(Hook|Intro|Body|CTA|Conclusion|Outro):\s*', '', clean, flags=re.IGNORECASE | re.MULTILINE).strip()
|
| 34 |
-
if clean:
|
| 35 |
-
cleaned_dialogue.append(clean)
|
| 36 |
-
|
| 37 |
-
clean_script = "\n\n".join(cleaned_dialogue)
|
| 38 |
|
| 39 |
-
|
| 40 |
-
scene_parts = re.findall(r'\[?SCENE DESCRIPTION\]?:?\s*(.*?)(?=\[?SCRIPT\]?|$)', full_text, re.DOTALL | re.IGNORECASE)
|
| 41 |
-
clean_scenes = "\n".join([p.strip().replace("**", "") for p in scene_parts if p.strip()])
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
# If no tags, try to strip common AI intro fluff
|
| 46 |
-
clean_script = re.sub(r'^(Here is|Sure|Okay|I can help|Youtube Script).*?$', '', full_text, flags=re.IGNORECASE | re.MULTILINE).strip()
|
| 47 |
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
def save_to_file(script_text):
|
| 51 |
if not script_text:
|
|
|
|
| 20 |
"""
|
| 21 |
|
| 22 |
def parse_script(full_text):
    """Split tagged model output into narration text and scene descriptions.

    The text is expected to contain sections introduced by ``[SCRIPT]`` and
    ``[SCENE DESCRIPTION]`` tags (case-insensitive; the brackets and a
    trailing colon are optional). Text that appears before the first tag is
    ignored.

    Args:
        full_text: Raw text to parse. ``None`` or an empty string yields
            two empty strings.

    Returns:
        A ``(script_text, scenes_text)`` tuple of strings. ``script_text`` is
        the narration with markdown headers, leading labels such as
        ``Hook:``, parenthesised or bracketed cues and bold markers removed;
        ``scenes_text`` is the scene descriptions, stripped but otherwise
        unmodified. Multiple sections of one kind are joined with a blank
        line. If no narration is found, ``script_text`` falls back to the
        full text with bracketed spans removed.
    """
    if not full_text:
        return "", ""

    # The lookarounds forbid a letter or digit directly next to the keyword,
    # so "script" inside words such as "description" or "subscription" is not
    # treated as a tag and narration is no longer cut mid-word.
    # NOTE: a bare standalone word "script" is still treated as a tag,
    # because unbracketed tags are intentionally supported.
    tag_pattern = (
        r'(\[?(?<![^\W_])(?:SCRIPT|SCENE DESCRIPTION)(?![^\W_])\]?:?)'
    )

    # With a capturing group, re.split returns content at even indices and
    # the matched tags at odd indices.
    parts = re.split(tag_pattern, full_text, flags=re.IGNORECASE)

    clean_script = []
    clean_scenes = []
    current_tag = None

    for index, part in enumerate(parts):
        if index % 2:
            # Classify tags by position, not by substring sniffing:
            # "description" contains "script", and ordinary content may
            # contain the words "scene" or "script".
            current_tag = "SCENE" if "scene" in part.lower() else "SCRIPT"
            continue

        if current_tag is None:
            # Preamble before the first tag.
            continue

        content = part.strip()
        if not content:
            continue

        if current_tag == "SCENE":
            clean_scenes.append(content)
            continue

        # Aggressive cleaning for TTS:
        # 1. Remove markdown headers
        content = re.sub(r'^#+.*$', '', content, flags=re.MULTILINE)
        # 2. Remove common metadata labels at the start of lines
        content = re.sub(
            r'^(Hook|Intro|Body|CTA|Conclusion|Outro|Segment|Step \d+):\s*',
            '',
            content,
            flags=re.IGNORECASE | re.MULTILINE,
        )
        # 3. Remove text in parentheses (visual cues/directions)
        content = re.sub(r'\(.*?\)', '', content, flags=re.DOTALL)
        # 4. Remove brackets like [Upbeat Music] inside a script section
        content = re.sub(r'\[.*?\]', '', content, flags=re.DOTALL)
        # 5. Remove bolding/formatting
        content = content.replace("**", "").replace("*", "").replace("__", "")

        final_content = content.strip()
        if final_content:
            clean_script.append(final_content)

    # Join the cleaned parts
    script_text = "\n\n".join(clean_script)
    scenes_text = "\n\n".join(clean_scenes)

    # Defensive cleanup: drop a stray "ION]" fragment at the start of a line.
    script_text = re.sub(
        r'^ION\]\s*', '', script_text, flags=re.IGNORECASE | re.MULTILINE
    )

    # Fallback: if no script was found, use the full text minus bracketed spans
    if not script_text:
        script_text = re.sub(
            r'\[.*?\]', '', full_text, flags=re.DOTALL
        ).strip()

    return script_text, scenes_text
|
| 81 |
|
| 82 |
def save_to_file(script_text):
|
| 83 |
if not script_text:
|