Spaces:
Sleeping
Sleeping
Update streamlit_app.py
Browse files- streamlit_app.py +64 -321
streamlit_app.py
CHANGED
|
@@ -9,17 +9,18 @@ Video‑analysis Streamlit app
|
|
| 9 |
# Imports
|
| 10 |
# ----------------------------------------------------------------------
|
| 11 |
import base64, hashlib, os, string, traceback
|
| 12 |
-
import time
|
| 13 |
from pathlib import Path
|
| 14 |
from difflib import SequenceMatcher
|
| 15 |
from typing import Tuple, Optional
|
| 16 |
|
| 17 |
import ffmpeg
|
| 18 |
-
|
|
|
|
| 19 |
import requests
|
| 20 |
import streamlit as st
|
| 21 |
import yt_dlp
|
| 22 |
-
|
| 23 |
|
| 24 |
# ----------------------------------------------------------------------
|
| 25 |
# Constants & defaults
|
|
@@ -52,7 +53,7 @@ def _sanitize_filename(url: str) -> str:
|
|
| 52 |
name = Path(url.split("?")[0]).name.lower() # Remove query parameters before getting name
|
| 53 |
if not name: # Fallback if URL doesn't have a clear file name (e.g., youtube.com/watch?v=...)
|
| 54 |
name = "downloaded_video"
|
| 55 |
-
# Allow periods for extensions, but sanitize other punctuation
|
| 56 |
name = name.translate(str.maketrans("", "", string.punctuation.replace(".", ""))).replace(" ", "_")
|
| 57 |
return name
|
| 58 |
|
|
@@ -152,8 +153,6 @@ def _download_with_yt_dlp(url: str, dst: Path, password: str = "") -> Path:
|
|
| 152 |
"""
|
| 153 |
# ---------- yt_dlp options ----------
|
| 154 |
# Use a more specific template to avoid clashes and ensure proper naming
|
| 155 |
-
# %(title)s is often good, but can be long, so combining with %(id)s is safer.
|
| 156 |
-
# We'll sanitize this name later.
|
| 157 |
tmpl = str(dst / "%(id)s.%(ext)s")
|
| 158 |
ydl_opts = {
|
| 159 |
"outtmpl": tmpl,
|
|
@@ -250,45 +249,14 @@ def download_video(url: str, dst: Path, password: str = "") -> Path:
|
|
| 250 |
# Always ensure the destination directory exists
|
| 251 |
dst.mkdir(parents=True, exist_ok=True)
|
| 252 |
|
| 253 |
-
# Simple check for direct video file links
|
| 254 |
-
|
| 255 |
-
|
|
|
|
| 256 |
return _download_direct(url, dst)
|
| 257 |
|
| 258 |
-
#
|
| 259 |
-
|
| 260 |
-
tweet_id = url.split("/")[-1].split("?")[0]
|
| 261 |
-
try:
|
| 262 |
-
# Use the newer snscrape directly (get_items is an iterator)
|
| 263 |
-
scraper = sntwitter.TwitterTweetScraper(tweet_id)
|
| 264 |
-
found_video_url = None
|
| 265 |
-
for i, tweet in enumerate(scraper.get_items()):
|
| 266 |
-
if i > 0: # Only need to check the first tweet for its media
|
| 267 |
-
break
|
| 268 |
-
for m in getattr(tweet, "media", []):
|
| 269 |
-
if getattr(m, "video_url", None):
|
| 270 |
-
found_video_url = m.video_url
|
| 271 |
-
break
|
| 272 |
-
if found_video_url:
|
| 273 |
-
break
|
| 274 |
-
# Also check general URLs in the tweet for direct video links
|
| 275 |
-
for u in getattr(tweet, "urls", []):
|
| 276 |
-
if u.expandedUrl and u.expandedUrl.lower().endswith(video_exts):
|
| 277 |
-
found_video_url = u.expandedUrl
|
| 278 |
-
break
|
| 279 |
-
if found_video_url:
|
| 280 |
-
break
|
| 281 |
-
|
| 282 |
-
if found_video_url:
|
| 283 |
-
st.info(f"Found video URL in tweet: {found_video_url}")
|
| 284 |
-
return download_video(found_video_url, dst) # Recurse with the actual video URL
|
| 285 |
-
else:
|
| 286 |
-
raise RuntimeError("No direct video or video URL found in the tweet content.")
|
| 287 |
-
except Exception as e:
|
| 288 |
-
st.warning(f"Failed to scrape Twitter for video, trying yt-dlp: {e}")
|
| 289 |
-
# Fall through to yt_dlp if scraping fails
|
| 290 |
-
|
| 291 |
-
# Default to yt_dlp for most other cases
|
| 292 |
return _download_with_yt_dlp(url, dst, password)
|
| 293 |
|
| 294 |
|
|
@@ -376,17 +344,17 @@ def generate_report(
|
|
| 376 |
return "\n".join(parts)
|
| 377 |
|
| 378 |
|
| 379 |
-
def _strip_prompt_echo(prompt: str, text: str,
|
| 380 |
"""
|
| 381 |
Strips the prompt from the beginning of the generated text if it appears
|
| 382 |
-
as an echo, using difflib.SequenceMatcher for
|
| 383 |
|
| 384 |
Args:
|
| 385 |
prompt: The original prompt sent to the model.
|
| 386 |
text: The generated text from the model.
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
|
| 391 |
Returns:
|
| 392 |
The text with the prompt echo removed, or the original text if no echo
|
|
@@ -395,284 +363,60 @@ def _strip_prompt_echo(prompt: str, text: str, threshold: float = 0.68) -> str:
|
|
| 395 |
if not prompt or not text:
|
| 396 |
return text
|
| 397 |
|
| 398 |
-
# Normalize both prompt and text for comparison
|
| 399 |
clean_prompt = " ".join(prompt.lower().split()).strip()
|
| 400 |
clean_text = " ".join(text.lower().split()).strip()
|
| 401 |
|
| 402 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
matcher = SequenceMatcher(None, clean_prompt, clean_text)
|
|
|
|
| 404 |
match = matcher.find_longest_match(0, len(clean_prompt), 0, len(clean_text))
|
| 405 |
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
#
|
| 419 |
-
#
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
# Find the first occurrence of the prompt (or a significant part of it)
|
| 439 |
-
# This simple `find` might still be an issue with variations.
|
| 440 |
-
# Let's revert to a slightly more sophisticated startswith check for the original logic.
|
| 441 |
-
# If the original `text` actually starts with `prompt` (case-insensitive, after stripping),
|
| 442 |
-
# then remove it. This avoids issues with `SequenceMatcher` finding a match in the middle.
|
| 443 |
-
|
| 444 |
-
# Re-evaluate based on finding the prompt within the text itself for removal.
|
| 445 |
-
# We use `clean_text.find(clean_prompt_part_that_matched)` to find the start in clean_text
|
| 446 |
-
# and then infer the end.
|
| 447 |
-
|
| 448 |
-
# A simpler, more robust way for removal: If we are confident a prompt echo exists,
|
| 449 |
-
# attempt to remove the prompt itself and any leading punctuation/whitespace.
|
| 450 |
-
# The `SequenceMatcher` gives us confidence.
|
| 451 |
-
|
| 452 |
-
# Find the end position of the matched prompt segment within `clean_text`
|
| 453 |
-
# This approach is still a bit brittle due to varying whitespace/punc
|
| 454 |
-
# between `clean_text` and `text`.
|
| 455 |
-
|
| 456 |
-
# Let's use the match.size directly to infer removal from original `text`.
|
| 457 |
-
# If `clean_text` starts with a chunk of `clean_prompt` of `match.size` length,
|
| 458 |
-
# we want to remove the corresponding part from `text`.
|
| 459 |
-
# The most direct way is to remove the prompt itself from the beginning of `text`
|
| 460 |
-
# and then strip leading delimiters.
|
| 461 |
-
|
| 462 |
-
# A safer method for stripping after confirming a match:
|
| 463 |
-
# 1. Take the text.
|
| 464 |
-
# 2. Convert a prefix of the text (e.g., first `len(prompt) + 50` chars) to lower case.
|
| 465 |
-
# 3. Compare with lower case prompt using SequenceMatcher.
|
| 466 |
-
# 4. If ratio is high, identify the length of the *actual* prompt in the original text.
|
| 467 |
-
# This is hard.
|
| 468 |
-
|
| 469 |
-
# Alternative: If a high ratio is found for the start of `clean_text` matching `clean_prompt`,
|
| 470 |
-
# then assume the prompt is echoed. We will remove the *original* prompt,
|
| 471 |
-
# and then strip any leading non-alphanumeric characters.
|
| 472 |
-
|
| 473 |
-
# The original logic of `_strip_prompt_echo` was:
|
| 474 |
-
# `if lower_text.startswith(clean_prompt): return text[len(prompt):].lstrip(" \n:-")`
|
| 475 |
-
# This relied on an exact match of the prompt's *cleaned* version with the start of the *cleaned* text.
|
| 476 |
-
# `SequenceMatcher` improves the "startswith" check.
|
| 477 |
-
|
| 478 |
-
# If `SequenceMatcher` indicates a strong match at the beginning (`match.b == 0`),
|
| 479 |
-
# we remove the prompt text (case-insensitive) from the start of the *original* text.
|
| 480 |
-
|
| 481 |
-
# Try to find the prompt (case-insensitive) at the beginning of the text
|
| 482 |
-
prompt_lower = prompt.lower()
|
| 483 |
-
text_lower_prefix = text[:len(prompt) + 50].lower() # Check a reasonable prefix
|
| 484 |
-
|
| 485 |
-
# This finds the start of the prompt within the text_lower_prefix
|
| 486 |
-
# Using find can be problematic if text has leading junk.
|
| 487 |
-
# Instead, just remove the prompt itself if we deem it echoed.
|
| 488 |
-
|
| 489 |
-
# Given the high confidence from SequenceMatcher (`match_ratio >= threshold`),
|
| 490 |
-
# we can attempt to remove a string equivalent to the prompt from the beginning of `text`.
|
| 491 |
-
# Find the index of the prompt's normalized version in the normalized text.
|
| 492 |
-
# This is still not perfect for original `text` whitespace.
|
| 493 |
-
|
| 494 |
-
# Let's refine the removal: remove the prompt string itself and then strip.
|
| 495 |
-
# This is still susceptible to minor leading variations.
|
| 496 |
-
|
| 497 |
-
# Re-thinking to be robust: If `clean_text` matches `clean_prompt` up to `match.size`
|
| 498 |
-
# at its beginning (match.b == 0), then we should remove `text` up to the length
|
| 499 |
-
# that corresponds to `match.size` in `clean_text`.
|
| 500 |
-
|
| 501 |
-
# This means we need to map `match.size` characters of `clean_text` back to `text`.
|
| 502 |
-
# This is complex. A simpler, somewhat heuristic approach:
|
| 503 |
-
|
| 504 |
-
# If `clean_prompt` matches the beginning of `clean_text` (match.b == 0)
|
| 505 |
-
# and the match is long enough (`match_ratio >= threshold`),
|
| 506 |
-
# then it is likely the prompt was echoed.
|
| 507 |
-
# We want to remove *at least* the prompt from the start, plus any leading junk.
|
| 508 |
-
|
| 509 |
-
# The original logic (`text[len(prompt):].lstrip(" \n:-")`) is good for removal *given* a match.
|
| 510 |
-
# The `SequenceMatcher` provides a better "given a match" condition.
|
| 511 |
-
|
| 512 |
-
# Find the actual end of the matching part in the original `text`
|
| 513 |
-
# This is the tricky part. A heuristic:
|
| 514 |
-
# Iterate through `text` and `prompt` simultaneously, skipping whitespace/punctuation.
|
| 515 |
-
# Count how many characters of `text` correspond to the matched `prompt` characters.
|
| 516 |
-
|
| 517 |
-
# Let's try to find the full (or most of) prompt within `text` (case insensitive)
|
| 518 |
-
# and remove that.
|
| 519 |
-
|
| 520 |
-
# Find the actual segment of the prompt that matched in the *original* `prompt` string
|
| 521 |
-
matched_segment_in_prompt_original_case = prompt[match.a : match.a + match.size]
|
| 522 |
-
|
| 523 |
-
# Find the index of this segment in the original `text`, if it's at the beginning
|
| 524 |
-
idx_in_text = text.lower().find(matched_segment_in_prompt_original_case.lower())
|
| 525 |
-
|
| 526 |
-
if idx_in_text == 0: # If the matched segment appears at the very beginning of the original text
|
| 527 |
-
# Try to remove the actual prompt from the text.
|
| 528 |
-
# This could be slightly off if the model added characters *inside* the prompt echo.
|
| 529 |
-
# The safest bet: if we have a high confidence match, strip the *entire* prompt,
|
| 530 |
-
# then strip leading noise.
|
| 531 |
-
|
| 532 |
-
# Assume the model output the prompt, potentially with minor changes.
|
| 533 |
-
# Remove a portion of `text` that is roughly `len(prompt)` long,
|
| 534 |
-
# then clean up leading characters.
|
| 535 |
-
|
| 536 |
-
# A robust heuristic for removal after `SequenceMatcher` confirms echo:
|
| 537 |
-
# Remove characters from the start of `text` until we reach a point
|
| 538 |
-
# where the remaining `text` no longer significantly matches `prompt`.
|
| 539 |
-
|
| 540 |
-
# Given match_ratio is high, we can be aggressive.
|
| 541 |
-
# The simplest removal is `text[len(prompt):]`.
|
| 542 |
-
# Then apply the lstrip.
|
| 543 |
-
|
| 544 |
-
# Determine the end index in `text` that corresponds to the end of the `clean_prompt` match
|
| 545 |
-
end_idx_in_clean_text = match.size
|
| 546 |
-
|
| 547 |
-
# Convert the `clean_text` end index back to an original `text` index
|
| 548 |
-
# This is still problematic.
|
| 549 |
-
|
| 550 |
-
# Let's stick to the simplest removal if the `SequenceMatcher` gives confidence.
|
| 551 |
-
# Remove characters up to the prompt's length, then strip leading non-alphanumeric.
|
| 552 |
-
# This might cut off too much or too little if the model's echo deviates
|
| 553 |
-
# significantly in length.
|
| 554 |
-
|
| 555 |
-
# A more refined approach:
|
| 556 |
-
# If clean_prompt is "abc" and clean_text is "abc def", match.size=3.
|
| 557 |
-
# We need to remove 3 characters from `text` and then lstrip.
|
| 558 |
-
# If clean_prompt is "abc" and clean_text is "ABC DEF", match.size=3.
|
| 559 |
-
# We need to remove 3 characters from `text` and then lstrip.
|
| 560 |
-
|
| 561 |
-
# The `match.size` gives the length of the longest *common* subsequence.
|
| 562 |
-
# This does not directly translate to the length of the "echoed prompt" in `text`.
|
| 563 |
-
# `SequenceMatcher` is good for *detection*, but mapping `match.size` back to actual
|
| 564 |
-
# string indices for removal is complex for strings with different whitespace.
|
| 565 |
-
|
| 566 |
-
# Let's go with a pragmatic approach: if `SequenceMatcher` says there's a strong echo at the start,
|
| 567 |
-
# we will remove the exact `prompt` string (case-insensitively) if it's there,
|
| 568 |
-
# and then strip leading noise. This is still safer than `text[match.size:]` as
|
| 569 |
-
# `match.size` is often smaller than the prompt's actual length.
|
| 570 |
-
|
| 571 |
-
# Try to remove the actual prompt from the beginning of the text,
|
| 572 |
-
# allowing for whitespace and punctuation before it.
|
| 573 |
-
|
| 574 |
-
# Find the actual (case-insensitive) start of the prompt within the text
|
| 575 |
-
# by searching for the normalized prompt.
|
| 576 |
-
|
| 577 |
-
# If SequenceMatcher gives high confidence, attempt to remove `len(prompt)`
|
| 578 |
-
# characters from the beginning of `text`, then strip.
|
| 579 |
-
# This is a heuristic, but often works well.
|
| 580 |
-
|
| 581 |
-
# Given the match, remove a prefix of `text` corresponding to `len(prompt)`
|
| 582 |
-
# and then strip leading punctuation/whitespace.
|
| 583 |
-
# This might cut off more or less than the actual echoed prompt if there are
|
| 584 |
-
# length differences in the echo.
|
| 585 |
-
|
| 586 |
-
# A robust way to remove the "matched portion" without exact index mapping:
|
| 587 |
-
# If `clean_prompt` matches `clean_text` strongly at the beginning,
|
| 588 |
-
# it means `clean_text` starts with `clean_prompt` (or a very similar version).
|
| 589 |
-
# We can remove `prompt` + any leading garbage characters.
|
| 590 |
-
|
| 591 |
-
# Let's try removing characters until the remaining text's start is no longer
|
| 592 |
-
# strongly similar to the prompt.
|
| 593 |
-
|
| 594 |
-
# A simpler, direct approach if `SequenceMatcher` confirms a strong match:
|
| 595 |
-
# Find where the `clean_prompt` *would end* in `clean_text` if it were there.
|
| 596 |
-
|
| 597 |
-
# This is what `difflib` is for: `SequenceMatcher` (a,b) identifies differences.
|
| 598 |
-
# What we want is the index in `text` where the "echo" ends.
|
| 599 |
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
lower_text = text.lower()
|
| 607 |
-
lower_prompt = prompt.lower()
|
| 608 |
|
| 609 |
-
|
| 610 |
-
# If it's at the very beginning (index 0), then remove it and strip.
|
| 611 |
-
if lower_text.startswith(lower_prompt):
|
| 612 |
-
return text[len(prompt):].lstrip(" \n:-")
|
| 613 |
-
else:
|
| 614 |
-
# If the exact match doesn't work, but SequenceMatcher was confident,
|
| 615 |
-
# it means there were minor variations.
|
| 616 |
-
# We can try to remove text up to `match.size` from the start of the *original* text
|
| 617 |
-
# and then strip. This is still risky.
|
| 618 |
-
|
| 619 |
-
# Instead, if the `SequenceMatcher` confidence is high, and `clean_text` starts
|
| 620 |
-
# with the matched part, simply remove a fixed length from `text`
|
| 621 |
-
# that is roughly the length of the prompt, and then strip.
|
| 622 |
-
# This is the most practical.
|
| 623 |
-
|
| 624 |
-
# Estimate the end position of the echoed prompt in the original text
|
| 625 |
-
# based on the length of the clean prompt.
|
| 626 |
-
# This is a heuristic.
|
| 627 |
-
estimated_end_of_echo = len(prompt)
|
| 628 |
-
|
| 629 |
-
# Remove characters up to this estimated position, then strip leading garbage
|
| 630 |
-
remaining_text = text[estimated_end_of_echo:].lstrip(" \n:-")
|
| 631 |
-
|
| 632 |
-
# If the remaining text is significantly shorter than original and still looks like it
|
| 633 |
-
# might have started with the prompt, this is a good guess.
|
| 634 |
-
# If this cut too much, it's problematic.
|
| 635 |
-
|
| 636 |
-
# Let's try removing characters from the start of `text` one by one,
|
| 637 |
-
# until the `SequenceMatcher` similarity with `prompt` drops below a threshold.
|
| 638 |
-
# This is computationally more expensive but more accurate for removal.
|
| 639 |
-
|
| 640 |
-
# A simpler, more direct implementation using the `SequenceMatcher` for *detection*
|
| 641 |
-
# and then a careful string removal:
|
| 642 |
-
# Remove the portion of `text` that corresponds to the `match.size` found by `SequenceMatcher`
|
| 643 |
-
# from the beginning of `clean_text`, and then map that length back to `text`.
|
| 644 |
-
|
| 645 |
-
# This is the most robust way to remove if `match.b == 0` (starts at beginning):
|
| 646 |
-
# We have `clean_text[0 : match.size]` which is `clean_prompt[match.a : match.a + match.size]`
|
| 647 |
-
# We need to find the equivalent `len` in the original `text`.
|
| 648 |
-
|
| 649 |
-
# This is a known hard problem. Let's simplify.
|
| 650 |
-
# If `SequenceMatcher` is confident (`match_ratio >= threshold`),
|
| 651 |
-
# we will remove the actual `prompt` string (case-insensitive),
|
| 652 |
-
# and then clean up.
|
| 653 |
-
|
| 654 |
-
# Revert to a simpler 'startswith' for removal, but use the `SequenceMatcher` for the *condition*.
|
| 655 |
-
# If the `SequenceMatcher` detected a match, it means `text` likely starts with `prompt`.
|
| 656 |
-
# Then we can apply the `startswith` logic for removal.
|
| 657 |
-
|
| 658 |
-
# Find the first occurrence of `clean_prompt` in `clean_text`
|
| 659 |
-
idx_start = clean_text.find(clean_prompt)
|
| 660 |
-
if idx_start == 0:
|
| 661 |
-
# If the clean prompt is found at the start of the clean text,
|
| 662 |
-
# remove the original prompt length from the original text.
|
| 663 |
-
# This is a heuristic that works well if prompt is echoed cleanly.
|
| 664 |
-
return text[len(prompt):].lstrip(" \n:-")
|
| 665 |
-
else:
|
| 666 |
-
# If the clean prompt itself isn't at the start, but SequenceMatcher
|
| 667 |
-
# found a strong match (e.g., "prompt: <prompt content>" vs "Prompt: <prompt content>"),
|
| 668 |
-
# we still want to remove it.
|
| 669 |
-
# The `match.size` tells us how much of `clean_prompt` matched.
|
| 670 |
-
# If `match.b == 0`, it means `clean_text` starts with a chunk of `clean_prompt`.
|
| 671 |
-
# We can try to remove the *length* of `clean_prompt` from `text`.
|
| 672 |
-
# This is a bit brute force but avoids complex mapping.
|
| 673 |
-
return text[len(clean_prompt):].lstrip(" \n:-")
|
| 674 |
-
|
| 675 |
-
# If no significant match at the beginning, return original text
|
| 676 |
return text
|
| 677 |
|
| 678 |
|
|
@@ -703,7 +447,7 @@ def _init_state() -> None:
|
|
| 703 |
"video_path": "",
|
| 704 |
"model_input": DEFAULT_MODEL,
|
| 705 |
"prompt": DEFAULT_PROMPT,
|
| 706 |
-
"api_key": os.getenv("GOOGLE_API_KEY", ""),
|
| 707 |
"video_password": "",
|
| 708 |
"compress_mb": 200,
|
| 709 |
"busy": False,
|
|
@@ -738,7 +482,7 @@ def main() -> None:
|
|
| 738 |
"Compress if > (MB)",
|
| 739 |
min_value=10,
|
| 740 |
max_value=2000,
|
| 741 |
-
value=st.session_state
|
| 742 |
step=10,
|
| 743 |
key="compress_mb",
|
| 744 |
)
|
|
@@ -888,5 +632,4 @@ def main() -> None:
|
|
| 888 |
# Entry point
|
| 889 |
# ----------------------------------------------------------------------
|
| 890 |
if __name__ == "__main__":
|
| 891 |
-
# No need to call _init_state() here – it is invoked inside main()
|
| 892 |
main()
|
|
|
|
| 9 |
# Imports
|
| 10 |
# ----------------------------------------------------------------------
|
| 11 |
import base64, hashlib, os, string, traceback
|
| 12 |
+
import time
|
| 13 |
from pathlib import Path
|
| 14 |
from difflib import SequenceMatcher
|
| 15 |
from typing import Tuple, Optional
|
| 16 |
|
| 17 |
import ffmpeg
|
| 18 |
+
# Changed from google.generativeai as genai to google.genai
|
| 19 |
+
import google.genai as genai
|
| 20 |
import requests
|
| 21 |
import streamlit as st
|
| 22 |
import yt_dlp
|
| 23 |
+
# Removed snscrape.modules.twitter as sntwitter due to errors and user request
|
| 24 |
|
| 25 |
# ----------------------------------------------------------------------
|
| 26 |
# Constants & defaults
|
|
|
|
| 53 |
name = Path(url.split("?")[0]).name.lower() # Remove query parameters before getting name
|
| 54 |
if not name: # Fallback if URL doesn't have a clear file name (e.g., youtube.com/watch?v=...)
|
| 55 |
name = "downloaded_video"
|
| 56 |
+
# Strip all punctuation except periods (kept so file extensions survive), then replace spaces with underscores
|
| 57 |
name = name.translate(str.maketrans("", "", string.punctuation.replace(".", ""))).replace(" ", "_")
|
| 58 |
return name
|
| 59 |
|
|
|
|
| 153 |
"""
|
| 154 |
# ---------- yt_dlp options ----------
|
| 155 |
# Use a more specific template to avoid clashes and ensure proper naming
|
|
|
|
|
|
|
| 156 |
tmpl = str(dst / "%(id)s.%(ext)s")
|
| 157 |
ydl_opts = {
|
| 158 |
"outtmpl": tmpl,
|
|
|
|
| 249 |
# Always ensure the destination directory exists
|
| 250 |
dst.mkdir(parents=True, exist_ok=True)
|
| 251 |
|
| 252 |
+
# Simple check for direct video file links (e.g., raw .mp4 link)
|
| 253 |
+
# Exclude common platforms that yt-dlp handles better even if they look like direct links
|
| 254 |
+
if url.lower().endswith(video_exts) and not any(platform in url for platform in ["youtube.com", "vimeo.com"]):
|
| 255 |
+
st.info(f"Attempting direct download for URL: {url}")
|
| 256 |
return _download_direct(url, dst)
|
| 257 |
|
| 258 |
+
# Default to yt_dlp for all other cases (e.g., YouTube, Vimeo, generic pages that yt_dlp can parse)
|
| 259 |
+
st.info(f"Attempting download with yt-dlp for URL: {url}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
return _download_with_yt_dlp(url, dst, password)
|
| 261 |
|
| 262 |
|
|
|
|
| 344 |
return "\n".join(parts)
|
| 345 |
|
| 346 |
|
| 347 |
+
def _strip_prompt_echo(prompt: str, text: str, similarity_threshold: float = 0.68) -> str:
|
| 348 |
"""
|
| 349 |
Strips the prompt from the beginning of the generated text if it appears
|
| 350 |
+
as an echo, using difflib.SequenceMatcher for robust matching.
|
| 351 |
|
| 352 |
Args:
|
| 353 |
prompt: The original prompt sent to the model.
|
| 354 |
text: The generated text from the model.
|
| 355 |
+
similarity_threshold: The similarity ratio (0.0 to 1.0) required for a match.
|
| 356 |
+
A value of 0.68 means at least 68% of the prompt must be
|
| 357 |
+
present at the beginning of the text to be considered an echo.
|
| 358 |
|
| 359 |
Returns:
|
| 360 |
The text with the prompt echo removed, or the original text if no echo
|
|
|
|
| 363 |
if not prompt or not text:
|
| 364 |
return text
|
| 365 |
|
| 366 |
+
# Normalize both prompt and text for robust comparison (lowercase, single spaces)
|
| 367 |
clean_prompt = " ".join(prompt.lower().split()).strip()
|
| 368 |
clean_text = " ".join(text.lower().split()).strip()
|
| 369 |
|
| 370 |
+
# Avoid processing if clean_prompt is much larger than clean_text,
|
| 371 |
+
# or if either is empty after cleaning
|
| 372 |
+
if not clean_prompt or not clean_text or len(clean_prompt) > len(clean_text) * 2:
|
| 373 |
+
return text
|
| 374 |
+
|
| 375 |
+
# Use SequenceMatcher to find the longest matching block at the beginning
|
| 376 |
matcher = SequenceMatcher(None, clean_prompt, clean_text)
|
| 377 |
+
# `match.b == 0` ensures the match starts at the very beginning of `clean_text`.
|
| 378 |
match = matcher.find_longest_match(0, len(clean_prompt), 0, len(clean_text))
|
| 379 |
|
| 380 |
+
if match.b == 0 and match.size > 0: # If a match starts at the beginning of the generated text
|
| 381 |
+
# Calculate the ratio of the matched segment to the *entire* prompt length.
|
| 382 |
+
match_ratio = match.size / len(clean_prompt)
|
| 383 |
+
|
| 384 |
+
if match_ratio >= similarity_threshold:
|
| 385 |
+
# High confidence that the prompt (or a very similar version)
|
| 386 |
+
# is echoed at the beginning of the generated text.
|
| 387 |
+
# Now, attempt to remove the echoed part from the original `text`.
|
| 388 |
+
|
| 389 |
+
original_text_idx = 0
|
| 390 |
+
original_prompt_idx = 0
|
| 391 |
+
|
| 392 |
+
# Iterate through both original strings, attempting to match characters
|
| 393 |
+
# while being tolerant of leading whitespace and punctuation in the text.
|
| 394 |
+
while original_text_idx < len(text) and original_prompt_idx < len(prompt):
|
| 395 |
+
char_text = text[original_text_idx]
|
| 396 |
+
char_prompt = prompt[original_prompt_idx]
|
| 397 |
+
|
| 398 |
+
if char_text.lower() == char_prompt.lower():
|
| 399 |
+
# Characters match (case-insensitively), advance both pointers
|
| 400 |
+
original_text_idx += 1
|
| 401 |
+
original_prompt_idx += 1
|
| 402 |
+
elif char_text.isspace() or char_text in string.punctuation:
|
| 403 |
+
# Current char in text is whitespace or punctuation,
|
| 404 |
+
# and it's not matching the current prompt char.
|
| 405 |
+
# Assume it's leading noise from the model's output; consume it.
|
| 406 |
+
original_text_idx += 1
|
| 407 |
+
else:
|
| 408 |
+
# Found a significant mismatch that isn't just whitespace/punctuation
|
| 409 |
+
# or the prompt ended. Stop matching.
|
| 410 |
+
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
|
| 412 |
+
# If a substantial portion of the prompt was "consumed" by this process,
|
| 413 |
+
# then we consider the prompt to have been echoed.
|
| 414 |
+
# Return the rest of the text, further stripping any residual leading
|
| 415 |
+
# whitespace/punctuation that the loop might have missed.
|
| 416 |
+
if original_prompt_idx / len(prompt) >= similarity_threshold:
|
| 417 |
+
return text[original_text_idx:].lstrip(" \n:-")
|
|
|
|
|
|
|
| 418 |
|
| 419 |
+
# If no significant match at the beginning, or threshold not met, return original text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
return text
|
| 421 |
|
| 422 |
|
|
|
|
| 447 |
"video_path": "",
|
| 448 |
"model_input": DEFAULT_MODEL,
|
| 449 |
"prompt": DEFAULT_PROMPT,
|
| 450 |
+
"api_key": os.getenv("GOOGLE_API_KEY", ""),
|
| 451 |
"video_password": "",
|
| 452 |
"compress_mb": 200,
|
| 453 |
"busy": False,
|
|
|
|
| 482 |
"Compress if > (MB)",
|
| 483 |
min_value=10,
|
| 484 |
max_value=2000,
|
| 485 |
+
value=st.session_state["compress_mb"], # Simplified from .get()
|
| 486 |
step=10,
|
| 487 |
key="compress_mb",
|
| 488 |
)
|
|
|
|
| 632 |
# Entry point
|
| 633 |
# ----------------------------------------------------------------------
|
| 634 |
if __name__ == "__main__":
|
|
|
|
| 635 |
main()
|