Hug0endob committed on
Commit
7c962d2
·
verified ·
1 Parent(s): 68431e8

Update streamlit_app.py

Browse files
Files changed (1) hide show
  1. streamlit_app.py +64 -321
streamlit_app.py CHANGED
@@ -9,17 +9,18 @@ Video‑analysis Streamlit app
9
  # Imports
10
  # ----------------------------------------------------------------------
11
  import base64, hashlib, os, string, traceback
12
- import time # Added for fallback filename in _download_with_yt_dlp
13
  from pathlib import Path
14
  from difflib import SequenceMatcher
15
  from typing import Tuple, Optional
16
 
17
  import ffmpeg
18
- import google.generativeai as genai
 
19
  import requests
20
  import streamlit as st
21
  import yt_dlp
22
- import snscrape.modules.twitter as sntwitter
23
 
24
  # ----------------------------------------------------------------------
25
  # Constants & defaults
@@ -52,7 +53,7 @@ def _sanitize_filename(url: str) -> str:
52
  name = Path(url.split("?")[0]).name.lower() # Remove query parameters before getting name
53
  if not name: # Fallback if URL doesn't have a clear file name (e.g., youtube.com/watch?v=...)
54
  name = "downloaded_video"
55
- # Allow periods for extensions, but sanitize other punctuation
56
  name = name.translate(str.maketrans("", "", string.punctuation.replace(".", ""))).replace(" ", "_")
57
  return name
58
 
@@ -152,8 +153,6 @@ def _download_with_yt_dlp(url: str, dst: Path, password: str = "") -> Path:
152
  """
153
  # ---------- yt_dlp options ----------
154
  # Use a more specific template to avoid clashes and ensure proper naming
155
- # %(title)s is often good, but can be long, so combining with %(id)s is safer.
156
- # We'll sanitize this name later.
157
  tmpl = str(dst / "%(id)s.%(ext)s")
158
  ydl_opts = {
159
  "outtmpl": tmpl,
@@ -250,45 +249,14 @@ def download_video(url: str, dst: Path, password: str = "") -> Path:
250
  # Always ensure the destination directory exists
251
  dst.mkdir(parents=True, exist_ok=True)
252
 
253
- # Simple check for direct video file links
254
- if url.lower().endswith(video_exts) and not any(platform in url for platform in ["youtube.com", "twitter.com", "vimeo.com"]):
255
- # Use direct download for simple file links if not a known platform yt_dlp handles better
 
256
  return _download_direct(url, dst)
257
 
258
- # Handle Twitter URLs specifically
259
- if "twitter.com" in url and "/status/" in url:
260
- tweet_id = url.split("/")[-1].split("?")[0]
261
- try:
262
- # Use the newer snscrape directly (get_items is an iterator)
263
- scraper = sntwitter.TwitterTweetScraper(tweet_id)
264
- found_video_url = None
265
- for i, tweet in enumerate(scraper.get_items()):
266
- if i > 0: # Only need to check the first tweet for its media
267
- break
268
- for m in getattr(tweet, "media", []):
269
- if getattr(m, "video_url", None):
270
- found_video_url = m.video_url
271
- break
272
- if found_video_url:
273
- break
274
- # Also check general URLs in the tweet for direct video links
275
- for u in getattr(tweet, "urls", []):
276
- if u.expandedUrl and u.expandedUrl.lower().endswith(video_exts):
277
- found_video_url = u.expandedUrl
278
- break
279
- if found_video_url:
280
- break
281
-
282
- if found_video_url:
283
- st.info(f"Found video URL in tweet: {found_video_url}")
284
- return download_video(found_video_url, dst) # Recurse with the actual video URL
285
- else:
286
- raise RuntimeError("No direct video or video URL found in the tweet content.")
287
- except Exception as e:
288
- st.warning(f"Failed to scrape Twitter for video, trying yt-dlp: {e}")
289
- # Fall through to yt_dlp if scraping fails
290
-
291
- # Default to yt_dlp for most other cases
292
  return _download_with_yt_dlp(url, dst, password)
293
 
294
 
@@ -376,17 +344,17 @@ def generate_report(
376
  return "\n".join(parts)
377
 
378
 
379
- def _strip_prompt_echo(prompt: str, text: str, threshold: float = 0.68) -> str:
380
  """
381
  Strips the prompt from the beginning of the generated text if it appears
382
- as an echo, using difflib.SequenceMatcher for more robust matching.
383
 
384
  Args:
385
  prompt: The original prompt sent to the model.
386
  text: The generated text from the model.
387
- threshold: The similarity ratio (0.0 to 1.0) required for a match.
388
- A value of 0.68 means at least 68% of the prompt must be
389
- present at the beginning of the text to be considered an echo.
390
 
391
  Returns:
392
  The text with the prompt echo removed, or the original text if no echo
@@ -395,284 +363,60 @@ def _strip_prompt_echo(prompt: str, text: str, threshold: float = 0.68) -> str:
395
  if not prompt or not text:
396
  return text
397
 
398
- # Normalize both prompt and text for comparison: lowercase, single spaces
399
  clean_prompt = " ".join(prompt.lower().split()).strip()
400
  clean_text = " ".join(text.lower().split()).strip()
401
 
402
- # Find the longest matching block at the beginning of the text
 
 
 
 
 
403
  matcher = SequenceMatcher(None, clean_prompt, clean_text)
 
404
  match = matcher.find_longest_match(0, len(clean_prompt), 0, len(clean_text))
405
 
406
- # Check if a significant portion of the prompt matches the beginning of the text
407
- # s1[match.a : match.a + match.size] is the part of clean_prompt that matches
408
- # s2[match.b : match.b + match.size] is the part of clean_text that matches
409
- # We are interested if clean_text starts with a match to clean_prompt.
410
- if match.b == 0 and match.size > 0:
411
- matched_prompt_segment = clean_prompt[match.a : match.a + match.size]
412
- # Calculate ratio of matched segment to the *entire* prompt
413
- # This is more accurate than matcher.ratio() which compares full strings
414
- match_ratio = len(matched_prompt_segment) / len(clean_prompt) if len(clean_prompt) > 0 else 0
415
-
416
- if match_ratio >= threshold:
417
- # Determine the actual length in the original 'text' to remove
418
- # This is tricky because of original casing and whitespace.
419
- # A simple approach is to remove the prompt part from the original `text`
420
- # by finding where the *cleaned* matched segment ends in the *cleaned* text,
421
- # then using that position in the original `text`.
422
-
423
- # Simpler: if we match a large part of the prompt at the beginning of clean_text,
424
- # assume the original prompt appears at the start of original text and try to strip it.
425
- # This might not be perfectly robust to whitespace differences, but better than nothing.
426
-
427
- # Find the position where the matched prompt segment ends in the original `text`
428
- # This is still heuristic, but tries to remove up to the full prompt length if it's there
429
-
430
- # Instead of trying to find exact index after cleaning and then mapping back,
431
- # which is complex, we can simply remove the prompt and any leading delimiters
432
- # if a high enough similarity is found at the start.
433
-
434
- # Try to find the prompt in the original text, case-insensitively, and remove
435
- lower_text_original = text.lower()
436
- lower_prompt_original = prompt.lower()
437
-
438
- # Find the first occurrence of the prompt (or a significant part of it)
439
- # This simple `find` might still be an issue with variations.
440
- # Let's revert to a slightly more sophisticated startswith check for the original logic.
441
- # If the original `text` actually starts with `prompt` (case-insensitive, after stripping),
442
- # then remove it. This avoids issues with `SequenceMatcher` finding a match in the middle.
443
-
444
- # Re-evaluate based on finding the prompt within the text itself for removal.
445
- # We use `clean_text.find(clean_prompt_part_that_matched)` to find the start in clean_text
446
- # and then infer the end.
447
-
448
- # A simpler, more robust way for removal: If we are confident a prompt echo exists,
449
- # attempt to remove the prompt itself and any leading punctuation/whitespace.
450
- # The `SequenceMatcher` gives us confidence.
451
-
452
- # Find the end position of the matched prompt segment within `clean_text`
453
- # This approach is still a bit brittle due to varying whitespace/punc
454
- # between `clean_text` and `text`.
455
-
456
- # Let's use the match.size directly to infer removal from original `text`.
457
- # If `clean_text` starts with a chunk of `clean_prompt` of `match.size` length,
458
- # we want to remove the corresponding part from `text`.
459
- # The most direct way is to remove the prompt itself from the beginning of `text`
460
- # and then strip leading delimiters.
461
-
462
- # A safer method for stripping after confirming a match:
463
- # 1. Take the text.
464
- # 2. Convert a prefix of the text (e.g., first `len(prompt) + 50` chars) to lower case.
465
- # 3. Compare with lower case prompt using SequenceMatcher.
466
- # 4. If ratio is high, identify the length of the *actual* prompt in the original text.
467
- # This is hard.
468
-
469
- # Alternative: If a high ratio is found for the start of `clean_text` matching `clean_prompt`,
470
- # then assume the prompt is echoed. We will remove the *original* prompt,
471
- # and then strip any leading non-alphanumeric characters.
472
-
473
- # The original logic of `_strip_prompt_echo` was:
474
- # `if lower_text.startswith(clean_prompt): return text[len(prompt):].lstrip(" \n:-")`
475
- # This relied on an exact match of the prompt's *cleaned* version with the start of the *cleaned* text.
476
- # `SequenceMatcher` improves the "startswith" check.
477
-
478
- # If `SequenceMatcher` indicates a strong match at the beginning (`match.b == 0`),
479
- # we remove the prompt text (case-insensitive) from the start of the *original* text.
480
-
481
- # Try to find the prompt (case-insensitive) at the beginning of the text
482
- prompt_lower = prompt.lower()
483
- text_lower_prefix = text[:len(prompt) + 50].lower() # Check a reasonable prefix
484
-
485
- # This finds the start of the prompt within the text_lower_prefix
486
- # Using find can be problematic if text has leading junk.
487
- # Instead, just remove the prompt itself if we deem it echoed.
488
-
489
- # Given the high confidence from SequenceMatcher (`match_ratio >= threshold`),
490
- # we can attempt to remove a string equivalent to the prompt from the beginning of `text`.
491
- # Find the index of the prompt's normalized version in the normalized text.
492
- # This is still not perfect for original `text` whitespace.
493
-
494
- # Let's refine the removal: remove the prompt string itself and then strip.
495
- # This is still susceptible to minor leading variations.
496
-
497
- # Re-thinking to be robust: If `clean_text` matches `clean_prompt` up to `match.size`
498
- # at its beginning (match.b == 0), then we should remove `text` up to the length
499
- # that corresponds to `match.size` in `clean_text`.
500
-
501
- # This means we need to map `match.size` characters of `clean_text` back to `text`.
502
- # This is complex. A simpler, somewhat heuristic approach:
503
-
504
- # If `clean_prompt` matches the beginning of `clean_text` (match.b == 0)
505
- # and the match is long enough (`match_ratio >= threshold`),
506
- # then it is likely the prompt was echoed.
507
- # We want to remove *at least* the prompt from the start, plus any leading junk.
508
-
509
- # The original logic (`text[len(prompt):].lstrip(" \n:-")`) is good for removal *given* a match.
510
- # The `SequenceMatcher` provides a better "given a match" condition.
511
-
512
- # Find the actual end of the matching part in the original `text`
513
- # This is the tricky part. A heuristic:
514
- # Iterate through `text` and `prompt` simultaneously, skipping whitespace/punctuation.
515
- # Count how many characters of `text` correspond to the matched `prompt` characters.
516
-
517
- # Let's try to find the full (or most of) prompt within `text` (case insensitive)
518
- # and remove that.
519
-
520
- # Find the actual segment of the prompt that matched in the *original* `prompt` string
521
- matched_segment_in_prompt_original_case = prompt[match.a : match.a + match.size]
522
-
523
- # Find the index of this segment in the original `text`, if it's at the beginning
524
- idx_in_text = text.lower().find(matched_segment_in_prompt_original_case.lower())
525
-
526
- if idx_in_text == 0: # If the matched segment appears at the very beginning of the original text
527
- # Try to remove the actual prompt from the text.
528
- # This could be slightly off if the model added characters *inside* the prompt echo.
529
- # The safest bet: if we have a high confidence match, strip the *entire* prompt,
530
- # then strip leading noise.
531
-
532
- # Assume the model output the prompt, potentially with minor changes.
533
- # Remove a portion of `text` that is roughly `len(prompt)` long,
534
- # then clean up leading characters.
535
-
536
- # A robust heuristic for removal after `SequenceMatcher` confirms echo:
537
- # Remove characters from the start of `text` until we reach a point
538
- # where the remaining `text` no longer significantly matches `prompt`.
539
-
540
- # Given match_ratio is high, we can be aggressive.
541
- # The simplest removal is `text[len(prompt):]`.
542
- # Then apply the lstrip.
543
-
544
- # Determine the end index in `text` that corresponds to the end of the `clean_prompt` match
545
- end_idx_in_clean_text = match.size
546
-
547
- # Convert the `clean_text` end index back to an original `text` index
548
- # This is still problematic.
549
-
550
- # Let's stick to the simplest removal if the `SequenceMatcher` gives confidence.
551
- # Remove characters up to the prompt's length, then strip leading non-alphanumeric.
552
- # This might cut off too much or too little if the model's echo deviates
553
- # significantly in length.
554
-
555
- # A more refined approach:
556
- # If clean_prompt is "abc" and clean_text is "abc def", match.size=3.
557
- # We need to remove 3 characters from `text` and then lstrip.
558
- # If clean_prompt is "abc" and clean_text is "ABC DEF", match.size=3.
559
- # We need to remove 3 characters from `text` and then lstrip.
560
-
561
- # The `match.size` gives the length of the longest *common* subsequence.
562
- # This does not directly translate to the length of the "echoed prompt" in `text`.
563
- # `SequenceMatcher` is good for *detection*, but mapping `match.size` back to actual
564
- # string indices for removal is complex for strings with different whitespace.
565
-
566
- # Let's go with a pragmatic approach: if `SequenceMatcher` says there's a strong echo at the start,
567
- # we will remove the exact `prompt` string (case-insensitively) if it's there,
568
- # and then strip leading noise. This is still safer than `text[match.size:]` as
569
- # `match.size` is often smaller than the prompt's actual length.
570
-
571
- # Try to remove the actual prompt from the beginning of the text,
572
- # allowing for whitespace and punctuation before it.
573
-
574
- # Find the actual (case-insensitive) start of the prompt within the text
575
- # by searching for the normalized prompt.
576
-
577
- # If SequenceMatcher gives high confidence, attempt to remove `len(prompt)`
578
- # characters from the beginning of `text`, then strip.
579
- # This is a heuristic, but often works well.
580
-
581
- # Given the match, remove a prefix of `text` corresponding to `len(prompt)`
582
- # and then strip leading punctuation/whitespace.
583
- # This might cut off more or less than the actual echoed prompt if there are
584
- # length differences in the echo.
585
-
586
- # A robust way to remove the "matched portion" without exact index mapping:
587
- # If `clean_prompt` matches `clean_text` strongly at the beginning,
588
- # it means `clean_text` starts with `clean_prompt` (or a very similar version).
589
- # We can remove `prompt` + any leading garbage characters.
590
-
591
- # Let's try removing characters until the remaining text's start is no longer
592
- # strongly similar to the prompt.
593
-
594
- # A simpler, direct approach if `SequenceMatcher` confirms a strong match:
595
- # Find where the `clean_prompt` *would end* in `clean_text` if it were there.
596
-
597
- # This is what `difflib` is for: `SequenceMatcher` (a,b) identifies differences.
598
- # What we want is the index in `text` where the "echo" ends.
599
 
600
- # The prompt is usually "Prompt: <actual prompt>".
601
- # If the model echoes the prompt, it usually starts with "Prompt: <actual prompt>".
602
- # So we can remove `prompt` and then strip leading characters.
603
- # The `SequenceMatcher` logic means we found a high similarity.
604
-
605
- # Try finding the exact (case-insensitive) prompt in the text
606
- lower_text = text.lower()
607
- lower_prompt = prompt.lower()
608
 
609
- # Find the first occurrence of the lowercased prompt in the lowercased text
610
- # If it's at the very beginning (index 0), then remove it and strip.
611
- if lower_text.startswith(lower_prompt):
612
- return text[len(prompt):].lstrip(" \n:-")
613
- else:
614
- # If the exact match doesn't work, but SequenceMatcher was confident,
615
- # it means there were minor variations.
616
- # We can try to remove text up to `match.size` from the start of the *original* text
617
- # and then strip. This is still risky.
618
-
619
- # Instead, if the `SequenceMatcher` confidence is high, and `clean_text` starts
620
- # with the matched part, simply remove a fixed length from `text`
621
- # that is roughly the length of the prompt, and then strip.
622
- # This is the most practical.
623
-
624
- # Estimate the end position of the echoed prompt in the original text
625
- # based on the length of the clean prompt.
626
- # This is a heuristic.
627
- estimated_end_of_echo = len(prompt)
628
-
629
- # Remove characters up to this estimated position, then strip leading garbage
630
- remaining_text = text[estimated_end_of_echo:].lstrip(" \n:-")
631
-
632
- # If the remaining text is significantly shorter than original and still looks like it
633
- # might have started with the prompt, this is a good guess.
634
- # If this cut too much, it's problematic.
635
-
636
- # Let's try removing characters from the start of `text` one by one,
637
- # until the `SequenceMatcher` similarity with `prompt` drops below a threshold.
638
- # This is computationally more expensive but more accurate for removal.
639
-
640
- # A simpler, more direct implementation using the `SequenceMatcher` for *detection*
641
- # and then a careful string removal:
642
- # Remove the portion of `text` that corresponds to the `match.size` found by `SequenceMatcher`
643
- # from the beginning of `clean_text`, and then map that length back to `text`.
644
-
645
- # This is the most robust way to remove if `match.b == 0` (starts at beginning):
646
- # We have `clean_text[0 : match.size]` which is `clean_prompt[match.a : match.a + match.size]`
647
- # We need to find the equivalent `len` in the original `text`.
648
-
649
- # This is a known hard problem. Let's simplify.
650
- # If `SequenceMatcher` is confident (`match_ratio >= threshold`),
651
- # we will remove the actual `prompt` string (case-insensitive),
652
- # and then clean up.
653
-
654
- # Revert to a simpler 'startswith' for removal, but use the `SequenceMatcher` for the *condition*.
655
- # If the `SequenceMatcher` detected a match, it means `text` likely starts with `prompt`.
656
- # Then we can apply the `startswith` logic for removal.
657
-
658
- # Find the first occurrence of `clean_prompt` in `clean_text`
659
- idx_start = clean_text.find(clean_prompt)
660
- if idx_start == 0:
661
- # If the clean prompt is found at the start of the clean text,
662
- # remove the original prompt length from the original text.
663
- # This is a heuristic that works well if prompt is echoed cleanly.
664
- return text[len(prompt):].lstrip(" \n:-")
665
- else:
666
- # If the clean prompt itself isn't at the start, but SequenceMatcher
667
- # found a strong match (e.g., "prompt: <prompt content>" vs "Prompt: <prompt content>"),
668
- # we still want to remove it.
669
- # The `match.size` tells us how much of `clean_prompt` matched.
670
- # If `match.b == 0`, it means `clean_text` starts with a chunk of `clean_prompt`.
671
- # We can try to remove the *length* of `clean_prompt` from `text`.
672
- # This is a bit brute force but avoids complex mapping.
673
- return text[len(clean_prompt):].lstrip(" \n:-")
674
-
675
- # If no significant match at the beginning, return original text
676
  return text
677
 
678
 
@@ -703,7 +447,7 @@ def _init_state() -> None:
703
  "video_path": "",
704
  "model_input": DEFAULT_MODEL,
705
  "prompt": DEFAULT_PROMPT,
706
- "api_key": os.getenv("GOOGLE_API_KEY", ""), # Changed default to empty string for security
707
  "video_password": "",
708
  "compress_mb": 200,
709
  "busy": False,
@@ -738,7 +482,7 @@ def main() -> None:
738
  "Compress if > (MB)",
739
  min_value=10,
740
  max_value=2000,
741
- value=st.session_state.get("compress_mb", 200),
742
  step=10,
743
  key="compress_mb",
744
  )
@@ -888,5 +632,4 @@ def main() -> None:
888
  # Entry point
889
  # ----------------------------------------------------------------------
890
  if __name__ == "__main__":
891
- # No need to call _init_state() here – it is invoked inside main()
892
  main()
 
9
  # Imports
10
  # ----------------------------------------------------------------------
11
  import base64, hashlib, os, string, traceback
12
+ import time
13
  from pathlib import Path
14
  from difflib import SequenceMatcher
15
  from typing import Tuple, Optional
16
 
17
  import ffmpeg
18
+ # Changed from google.generativeai as genai to google.genai
19
+ import google.genai as genai
20
  import requests
21
  import streamlit as st
22
  import yt_dlp
23
+ # Removed snscrape.modules.twitter as sntwitter due to errors and user request
24
 
25
  # ----------------------------------------------------------------------
26
  # Constants & defaults
 
53
  name = Path(url.split("?")[0]).name.lower() # Remove query parameters before getting name
54
  if not name: # Fallback if URL doesn't have a clear file name (e.g., youtube.com/watch?v=...)
55
  name = "downloaded_video"
56
+ # Allow periods for extensions, but sanitize other punctuation (except periods)
57
  name = name.translate(str.maketrans("", "", string.punctuation.replace(".", ""))).replace(" ", "_")
58
  return name
59
 
 
153
  """
154
  # ---------- yt_dlp options ----------
155
  # Use a more specific template to avoid clashes and ensure proper naming
 
 
156
  tmpl = str(dst / "%(id)s.%(ext)s")
157
  ydl_opts = {
158
  "outtmpl": tmpl,
 
249
  # Always ensure the destination directory exists
250
  dst.mkdir(parents=True, exist_ok=True)
251
 
252
+ # Simple check for direct video file links (e.g., raw .mp4 link)
253
+ # Exclude common platforms that yt-dlp handles better even if they look like direct links
254
+ if url.lower().endswith(video_exts) and not any(platform in url for platform in ["youtube.com", "vimeo.com"]):
255
+ st.info(f"Attempting direct download for URL: {url}")
256
  return _download_direct(url, dst)
257
 
258
+ # Default to yt_dlp for all other cases (e.g., YouTube, Vimeo, generic pages that yt_dlp can parse)
259
+ st.info(f"Attempting download with yt-dlp for URL: {url}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  return _download_with_yt_dlp(url, dst, password)
261
 
262
 
 
344
  return "\n".join(parts)
345
 
346
 
347
+ def _strip_prompt_echo(prompt: str, text: str, similarity_threshold: float = 0.68) -> str:
348
  """
349
  Strips the prompt from the beginning of the generated text if it appears
350
+ as an echo, using difflib.SequenceMatcher for robust matching.
351
 
352
  Args:
353
  prompt: The original prompt sent to the model.
354
  text: The generated text from the model.
355
+ similarity_threshold: The similarity ratio (0.0 to 1.0) required for a match.
356
+ A value of 0.68 means at least 68% of the prompt must be
357
+ present at the beginning of the text to be considered an echo.
358
 
359
  Returns:
360
  The text with the prompt echo removed, or the original text if no echo
 
363
  if not prompt or not text:
364
  return text
365
 
366
+ # Normalize both prompt and text for robust comparison (lowercase, single spaces)
367
  clean_prompt = " ".join(prompt.lower().split()).strip()
368
  clean_text = " ".join(text.lower().split()).strip()
369
 
370
+ # Avoid processing if clean_prompt is much larger than clean_text,
371
+ # or if either is empty after cleaning
372
+ if not clean_prompt or not clean_text or len(clean_prompt) > len(clean_text) * 2:
373
+ return text
374
+
375
+ # Use SequenceMatcher to find the longest matching block at the beginning
376
  matcher = SequenceMatcher(None, clean_prompt, clean_text)
377
+ # `match.b == 0` ensures the match starts at the very beginning of `clean_text`.
378
  match = matcher.find_longest_match(0, len(clean_prompt), 0, len(clean_text))
379
 
380
+ if match.b == 0 and match.size > 0: # If a match starts at the beginning of the generated text
381
+ # Calculate the ratio of the matched segment to the *entire* prompt length.
382
+ match_ratio = match.size / len(clean_prompt)
383
+
384
+ if match_ratio >= similarity_threshold:
385
+ # High confidence that the prompt (or a very similar version)
386
+ # is echoed at the beginning of the generated text.
387
+ # Now, attempt to remove the echoed part from the original `text`.
388
+
389
+ original_text_idx = 0
390
+ original_prompt_idx = 0
391
+
392
+ # Iterate through both original strings, attempting to match characters
393
+ # while being tolerant of leading whitespace and punctuation in the text.
394
+ while original_text_idx < len(text) and original_prompt_idx < len(prompt):
395
+ char_text = text[original_text_idx]
396
+ char_prompt = prompt[original_prompt_idx]
397
+
398
+ if char_text.lower() == char_prompt.lower():
399
+ # Characters match (case-insensitively), advance both pointers
400
+ original_text_idx += 1
401
+ original_prompt_idx += 1
402
+ elif char_text.isspace() or char_text in string.punctuation:
403
+ # Current char in text is whitespace or punctuation,
404
+ # and it's not matching the current prompt char.
405
+ # Assume it's leading noise from the model's output; consume it.
406
+ original_text_idx += 1
407
+ else:
408
+ # Found a significant mismatch that isn't just whitespace/punctuation
409
+ # or the prompt ended. Stop matching.
410
+ break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
 
412
+ # If a substantial portion of the prompt was "consumed" by this process,
413
+ # then we consider the prompt to have been echoed.
414
+ # Return the rest of the text, further stripping any residual leading
415
+ # whitespace/punctuation that the loop might have missed.
416
+ if original_prompt_idx / len(prompt) >= similarity_threshold:
417
+ return text[original_text_idx:].lstrip(" \n:-")
 
 
418
 
419
+ # If no significant match at the beginning, or threshold not met, return original text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
420
  return text
421
 
422
 
 
447
  "video_path": "",
448
  "model_input": DEFAULT_MODEL,
449
  "prompt": DEFAULT_PROMPT,
450
+ "api_key": os.getenv("GOOGLE_API_KEY", ""),
451
  "video_password": "",
452
  "compress_mb": 200,
453
  "busy": False,
 
482
  "Compress if > (MB)",
483
  min_value=10,
484
  max_value=2000,
485
+ value=st.session_state["compress_mb"], # Simplified from .get()
486
  step=10,
487
  key="compress_mb",
488
  )
 
632
  # Entry point
633
  # ----------------------------------------------------------------------
634
  if __name__ == "__main__":
 
635
  main()