Video-Analysis-Tool

Sleeping

App Files Files Community

Hug0endob committed 10 days ago

Commit

7c962d2

verified ·

1 Parent(s): 68431e8

Update streamlit_app.py

Browse files

Files changed (1) hide show

streamlit_app.py +64 -321

streamlit_app.py CHANGED Viewed

@@ -9,17 +9,18 @@ Video‑analysis Streamlit app
 # Imports
 # ----------------------------------------------------------------------
 import base64, hashlib, os, string, traceback
-import time # Added for fallback filename in _download_with_yt_dlp
 from pathlib import Path
 from difflib import SequenceMatcher
 from typing import Tuple, Optional
 import ffmpeg
-import google.generativeai as genai
 import requests
 import streamlit as st
 import yt_dlp
-import snscrape.modules.twitter as sntwitter
 # ----------------------------------------------------------------------
 # Constants & defaults
@@ -52,7 +53,7 @@ def _sanitize_filename(url: str) -> str:
     name = Path(url.split("?")[0]).name.lower() # Remove query parameters before getting name
     if not name: # Fallback if URL doesn't have a clear file name (e.g., youtube.com/watch?v=...)
         name = "downloaded_video"
-    # Allow periods for extensions, but sanitize other punctuation
     name = name.translate(str.maketrans("", "", string.punctuation.replace(".", ""))).replace(" ", "_")
     return name
@@ -152,8 +153,6 @@ def _download_with_yt_dlp(url: str, dst: Path, password: str = "") -> Path:
     """
     # ---------- yt_dlp options ----------
     # Use a more specific template to avoid clashes and ensure proper naming
-    # %(title)s is often good, but can be long, so combining with %(id)s is safer.
-    # We'll sanitize this name later.
     tmpl = str(dst / "%(id)s.%(ext)s")
     ydl_opts = {
         "outtmpl": tmpl,
@@ -250,45 +249,14 @@ def download_video(url: str, dst: Path, password: str = "") -> Path:
     # Always ensure the destination directory exists
     dst.mkdir(parents=True, exist_ok=True)
-    # Simple check for direct video file links
-    if url.lower().endswith(video_exts) and not any(platform in url for platform in ["youtube.com", "twitter.com", "vimeo.com"]):
-        # Use direct download for simple file links if not a known platform yt_dlp handles better
         return _download_direct(url, dst)
-    # Handle Twitter URLs specifically
-    if "twitter.com" in url and "/status/" in url:
-        tweet_id = url.split("/")[-1].split("?")[0]
-        try:
-            # Use the newer snscrape directly (get_items is an iterator)
-            scraper = sntwitter.TwitterTweetScraper(tweet_id)
-            found_video_url = None
-            for i, tweet in enumerate(scraper.get_items()):
-                if i > 0: # Only need to check the first tweet for its media
-                    break
-                for m in getattr(tweet, "media", []):
-                    if getattr(m, "video_url", None):
-                        found_video_url = m.video_url
-                        break
-                if found_video_url:
-                    break
-                # Also check general URLs in the tweet for direct video links
-                for u in getattr(tweet, "urls", []):
-                    if u.expandedUrl and u.expandedUrl.lower().endswith(video_exts):
-                        found_video_url = u.expandedUrl
-                        break
-                if found_video_url:
-                    break
-            if found_video_url:
-                st.info(f"Found video URL in tweet: {found_video_url}")
-                return download_video(found_video_url, dst) # Recurse with the actual video URL
-            else:
-                raise RuntimeError("No direct video or video URL found in the tweet content.")
-        except Exception as e:
-            st.warning(f"Failed to scrape Twitter for video, trying yt-dlp: {e}")
-            # Fall through to yt_dlp if scraping fails
-    # Default to yt_dlp for most other cases
     return _download_with_yt_dlp(url, dst, password)
@@ -376,17 +344,17 @@ def generate_report(
     return "\n".join(parts)
-def _strip_prompt_echo(prompt: str, text: str, threshold: float = 0.68) -> str:
     """
     Strips the prompt from the beginning of the generated text if it appears
-    as an echo, using difflib.SequenceMatcher for more robust matching.
     Args:
         prompt: The original prompt sent to the model.
         text: The generated text from the model.
-        threshold: The similarity ratio (0.0 to 1.0) required for a match.
-                   A value of 0.68 means at least 68% of the prompt must be
-                   present at the beginning of the text to be considered an echo.
     Returns:
         The text with the prompt echo removed, or the original text if no echo
@@ -395,284 +363,60 @@ def _strip_prompt_echo(prompt: str, text: str, threshold: float = 0.68) -> str:
     if not prompt or not text:
         return text
-    # Normalize both prompt and text for comparison: lowercase, single spaces
     clean_prompt = " ".join(prompt.lower().split()).strip()
     clean_text = " ".join(text.lower().split()).strip()
-    # Find the longest matching block at the beginning of the text
     matcher = SequenceMatcher(None, clean_prompt, clean_text)
     match = matcher.find_longest_match(0, len(clean_prompt), 0, len(clean_text))
-    # Check if a significant portion of the prompt matches the beginning of the text
-    # s1[match.a : match.a + match.size] is the part of clean_prompt that matches
-    # s2[match.b : match.b + match.size] is the part of clean_text that matches
-    # We are interested if clean_text starts with a match to clean_prompt.
-    if match.b == 0 and match.size > 0:
-        matched_prompt_segment = clean_prompt[match.a : match.a + match.size]
-        # Calculate ratio of matched segment to the *entire* prompt
-        # This is more accurate than matcher.ratio() which compares full strings
-        match_ratio = len(matched_prompt_segment) / len(clean_prompt) if len(clean_prompt) > 0 else 0
-        if match_ratio >= threshold:
-            # Determine the actual length in the original 'text' to remove
-            # This is tricky because of original casing and whitespace.
-            # A simple approach is to remove the prompt part from the original `text`
-            # by finding where the *cleaned* matched segment ends in the *cleaned* text,
-            # then using that position in the original `text`.
-            # Simpler: if we match a large part of the prompt at the beginning of clean_text,
-            # assume the original prompt appears at the start of original text and try to strip it.
-            # This might not be perfectly robust to whitespace differences, but better than nothing.
-            # Find the position where the matched prompt segment ends in the original `text`
-            # This is still heuristic, but tries to remove up to the full prompt length if it's there
-            # Instead of trying to find exact index after cleaning and then mapping back,
-            # which is complex, we can simply remove the prompt and any leading delimiters
-            # if a high enough similarity is found at the start.
-            # Try to find the prompt in the original text, case-insensitively, and remove
-            lower_text_original = text.lower()
-            lower_prompt_original = prompt.lower()
-            # Find the first occurrence of the prompt (or a significant part of it)
-            # This simple `find` might still be an issue with variations.
-            # Let's revert to a slightly more sophisticated startswith check for the original logic.
-            # If the original `text` actually starts with `prompt` (case-insensitive, after stripping),
-            # then remove it. This avoids issues with `SequenceMatcher` finding a match in the middle.
-            # Re-evaluate based on finding the prompt within the text itself for removal.
-            # We use `clean_text.find(clean_prompt_part_that_matched)` to find the start in clean_text
-            # and then infer the end.
-            # A simpler, more robust way for removal: If we are confident a prompt echo exists,
-            # attempt to remove the prompt itself and any leading punctuation/whitespace.
-            # The `SequenceMatcher` gives us confidence.
-            # Find the end position of the matched prompt segment within `clean_text`
-            # This approach is still a bit brittle due to varying whitespace/punc
-            # between `clean_text` and `text`.
-            # Let's use the match.size directly to infer removal from original `text`.
-            # If `clean_text` starts with a chunk of `clean_prompt` of `match.size` length,
-            # we want to remove the corresponding part from `text`.
-            # The most direct way is to remove the prompt itself from the beginning of `text`
-            # and then strip leading delimiters.
-            # A safer method for stripping after confirming a match:
-            # 1. Take the text.
-            # 2. Convert a prefix of the text (e.g., first `len(prompt) + 50` chars) to lower case.
-            # 3. Compare with lower case prompt using SequenceMatcher.
-            # 4. If ratio is high, identify the length of the *actual* prompt in the original text.
-            # This is hard.
-            # Alternative: If a high ratio is found for the start of `clean_text` matching `clean_prompt`,
-            # then assume the prompt is echoed. We will remove the *original* prompt,
-            # and then strip any leading non-alphanumeric characters.
-            # The original logic of `_strip_prompt_echo` was:
-            # `if lower_text.startswith(clean_prompt): return text[len(prompt):].lstrip(" \n:-")`
-            # This relied on an exact match of the prompt's *cleaned* version with the start of the *cleaned* text.
-            # `SequenceMatcher` improves the "startswith" check.
-            # If `SequenceMatcher` indicates a strong match at the beginning (`match.b == 0`),
-            # we remove the prompt text (case-insensitive) from the start of the *original* text.
-            # Try to find the prompt (case-insensitive) at the beginning of the text
-            prompt_lower = prompt.lower()
-            text_lower_prefix = text[:len(prompt) + 50].lower() # Check a reasonable prefix
-            # This finds the start of the prompt within the text_lower_prefix
-            # Using find can be problematic if text has leading junk.
-            # Instead, just remove the prompt itself if we deem it echoed.
-            # Given the high confidence from SequenceMatcher (`match_ratio >= threshold`),
-            # we can attempt to remove a string equivalent to the prompt from the beginning of `text`.
-            # Find the index of the prompt's normalized version in the normalized text.
-            # This is still not perfect for original `text` whitespace.
-            # Let's refine the removal: remove the prompt string itself and then strip.
-            # This is still susceptible to minor leading variations.
-            # Re-thinking to be robust: If `clean_text` matches `clean_prompt` up to `match.size`
-            # at its beginning (match.b == 0), then we should remove `text` up to the length
-            # that corresponds to `match.size` in `clean_text`.
-            # This means we need to map `match.size` characters of `clean_text` back to `text`.
-            # This is complex. A simpler, somewhat heuristic approach:
-            # If `clean_prompt` matches the beginning of `clean_text` (match.b == 0)
-            # and the match is long enough (`match_ratio >= threshold`),
-            # then it is likely the prompt was echoed.
-            # We want to remove *at least* the prompt from the start, plus any leading junk.
-            # The original logic (`text[len(prompt):].lstrip(" \n:-")`) is good for removal *given* a match.
-            # The `SequenceMatcher` provides a better "given a match" condition.
-            # Find the actual end of the matching part in the original `text`
-            # This is the tricky part. A heuristic:
-            # Iterate through `text` and `prompt` simultaneously, skipping whitespace/punctuation.
-            # Count how many characters of `text` correspond to the matched `prompt` characters.
-            # Let's try to find the full (or most of) prompt within `text` (case insensitive)
-            # and remove that.
-            # Find the actual segment of the prompt that matched in the *original* `prompt` string
-            matched_segment_in_prompt_original_case = prompt[match.a : match.a + match.size]
-            # Find the index of this segment in the original `text`, if it's at the beginning
-            idx_in_text = text.lower().find(matched_segment_in_prompt_original_case.lower())
-            if idx_in_text == 0: # If the matched segment appears at the very beginning of the original text
-                # Try to remove the actual prompt from the text.
-                # This could be slightly off if the model added characters *inside* the prompt echo.
-                # The safest bet: if we have a high confidence match, strip the *entire* prompt,
-                # then strip leading noise.
-                # Assume the model output the prompt, potentially with minor changes.
-                # Remove a portion of `text` that is roughly `len(prompt)` long,
-                # then clean up leading characters.
-                # A robust heuristic for removal after `SequenceMatcher` confirms echo:
-                # Remove characters from the start of `text` until we reach a point
-                # where the remaining `text` no longer significantly matches `prompt`.
-                # Given match_ratio is high, we can be aggressive.
-                # The simplest removal is `text[len(prompt):]`.
-                # Then apply the lstrip.
-                # Determine the end index in `text` that corresponds to the end of the `clean_prompt` match
-                end_idx_in_clean_text = match.size
-                # Convert the `clean_text` end index back to an original `text` index
-                # This is still problematic.
-                # Let's stick to the simplest removal if the `SequenceMatcher` gives confidence.
-                # Remove characters up to the prompt's length, then strip leading non-alphanumeric.
-                # This might cut off too much or too little if the model's echo deviates
-                # significantly in length.
-                # A more refined approach:
-                # If clean_prompt is "abc" and clean_text is "abc def", match.size=3.
-                # We need to remove 3 characters from `text` and then lstrip.
-                # If clean_prompt is "abc" and clean_text is "ABC DEF", match.size=3.
-                # We need to remove 3 characters from `text` and then lstrip.
-                # The `match.size` gives the length of the longest *common* subsequence.
-                # This does not directly translate to the length of the "echoed prompt" in `text`.
-                # `SequenceMatcher` is good for *detection*, but mapping `match.size` back to actual
-                # string indices for removal is complex for strings with different whitespace.
-                # Let's go with a pragmatic approach: if `SequenceMatcher` says there's a strong echo at the start,
-                # we will remove the exact `prompt` string (case-insensitively) if it's there,
-                # and then strip leading noise. This is still safer than `text[match.size:]` as
-                # `match.size` is often smaller than the prompt's actual length.
-                # Try to remove the actual prompt from the beginning of the text,
-                # allowing for whitespace and punctuation before it.
-                # Find the actual (case-insensitive) start of the prompt within the text
-                # by searching for the normalized prompt.
-                # If SequenceMatcher gives high confidence, attempt to remove `len(prompt)`
-                # characters from the beginning of `text`, then strip.
-                # This is a heuristic, but often works well.
-                # Given the match, remove a prefix of `text` corresponding to `len(prompt)`
-                # and then strip leading punctuation/whitespace.
-                # This might cut off more or less than the actual echoed prompt if there are
-                # length differences in the echo.
-                # A robust way to remove the "matched portion" without exact index mapping:
-                # If `clean_prompt` matches `clean_text` strongly at the beginning,
-                # it means `clean_text` starts with `clean_prompt` (or a very similar version).
-                # We can remove `prompt` + any leading garbage characters.
-                # Let's try removing characters until the remaining text's start is no longer
-                # strongly similar to the prompt.
-                # A simpler, direct approach if `SequenceMatcher` confirms a strong match:
-                # Find where the `clean_prompt` *would end* in `clean_text` if it were there.
-                # This is what `difflib` is for: `SequenceMatcher` (a,b) identifies differences.
-                # What we want is the index in `text` where the "echo" ends.
-                # The prompt is usually "Prompt: <actual prompt>".
-                # If the model echoes the prompt, it usually starts with "Prompt: <actual prompt>".
-                # So we can remove `prompt` and then strip leading characters.
-                # The `SequenceMatcher` logic means we found a high similarity.
-                # Try finding the exact (case-insensitive) prompt in the text
-                lower_text = text.lower()
-                lower_prompt = prompt.lower()
-                # Find the first occurrence of the lowercased prompt in the lowercased text
-                # If it's at the very beginning (index 0), then remove it and strip.
-                if lower_text.startswith(lower_prompt):
-                    return text[len(prompt):].lstrip(" \n:-")
-                else:
-                    # If the exact match doesn't work, but SequenceMatcher was confident,
-                    # it means there were minor variations.
-                    # We can try to remove text up to `match.size` from the start of the *original* text
-                    # and then strip. This is still risky.
-                    # Instead, if the `SequenceMatcher` confidence is high, and `clean_text` starts
-                    # with the matched part, simply remove a fixed length from `text`
-                    # that is roughly the length of the prompt, and then strip.
-                    # This is the most practical.
-                    # Estimate the end position of the echoed prompt in the original text
-                    # based on the length of the clean prompt.
-                    # This is a heuristic.
-                    estimated_end_of_echo = len(prompt)
-                    # Remove characters up to this estimated position, then strip leading garbage
-                    remaining_text = text[estimated_end_of_echo:].lstrip(" \n:-")
-                    # If the remaining text is significantly shorter than original and still looks like it
-                    # might have started with the prompt, this is a good guess.
-                    # If this cut too much, it's problematic.
-                    # Let's try removing characters from the start of `text` one by one,
-                    # until the `SequenceMatcher` similarity with `prompt` drops below a threshold.
-                    # This is computationally more expensive but more accurate for removal.
-                    # A simpler, more direct implementation using the `SequenceMatcher` for *detection*
-                    # and then a careful string removal:
-                    # Remove the portion of `text` that corresponds to the `match.size` found by `SequenceMatcher`
-                    # from the beginning of `clean_text`, and then map that length back to `text`.
-                    # This is the most robust way to remove if `match.b == 0` (starts at beginning):
-                    # We have `clean_text[0 : match.size]` which is `clean_prompt[match.a : match.a + match.size]`
-                    # We need to find the equivalent `len` in the original `text`.
-                    # This is a known hard problem. Let's simplify.
-                    # If `SequenceMatcher` is confident (`match_ratio >= threshold`),
-                    # we will remove the actual `prompt` string (case-insensitive),
-                    # and then clean up.
-                    # Revert to a simpler 'startswith' for removal, but use the `SequenceMatcher` for the *condition*.
-                    # If the `SequenceMatcher` detected a match, it means `text` likely starts with `prompt`.
-                    # Then we can apply the `startswith` logic for removal.
-                    # Find the first occurrence of `clean_prompt` in `clean_text`
-                    idx_start = clean_text.find(clean_prompt)
-                    if idx_start == 0:
-                        # If the clean prompt is found at the start of the clean text,
-                        # remove the original prompt length from the original text.
-                        # This is a heuristic that works well if prompt is echoed cleanly.
-                        return text[len(prompt):].lstrip(" \n:-")
-                    else:
-                        # If the clean prompt itself isn't at the start, but SequenceMatcher
-                        # found a strong match (e.g., "prompt: <prompt content>" vs "Prompt: <prompt content>"),
-                        # we still want to remove it.
-                        # The `match.size` tells us how much of `clean_prompt` matched.
-                        # If `match.b == 0`, it means `clean_text` starts with a chunk of `clean_prompt`.
-                        # We can try to remove the *length* of `clean_prompt` from `text`.
-                        # This is a bit brute force but avoids complex mapping.
-                        return text[len(clean_prompt):].lstrip(" \n:-")
-    # If no significant match at the beginning, return original text
     return text
@@ -703,7 +447,7 @@ def _init_state() -> None:
         "video_path": "",
         "model_input": DEFAULT_MODEL,
         "prompt": DEFAULT_PROMPT,
-        "api_key": os.getenv("GOOGLE_API_KEY", ""), # Changed default to empty string for security
         "video_password": "",
         "compress_mb": 200,
         "busy": False,
@@ -738,7 +482,7 @@ def main() -> None:
         "Compress if > (MB)",
         min_value=10,
         max_value=2000,
-        value=st.session_state.get("compress_mb", 200),
         step=10,
         key="compress_mb",
     )
@@ -888,5 +632,4 @@ def main() -> None:
 # Entry point
 # ----------------------------------------------------------------------
 if __name__ == "__main__":
-    # No need to call _init_state() here – it is invoked inside main()
     main()

 # Imports
 # ----------------------------------------------------------------------
 import base64, hashlib, os, string, traceback
+import time
 from pathlib import Path
 from difflib import SequenceMatcher
 from typing import Tuple, Optional
 import ffmpeg
+# Changed from google.generativeai as genai to google.genai
+import google.genai as genai
 import requests
 import streamlit as st
 import yt_dlp
+# Removed snscrape.modules.twitter as sntwitter due to errors and user request
 # ----------------------------------------------------------------------
 # Constants & defaults
     name = Path(url.split("?")[0]).name.lower() # Remove query parameters before getting name
     if not name: # Fallback if URL doesn't have a clear file name (e.g., youtube.com/watch?v=...)
         name = "downloaded_video"
+    # Keep periods (needed for file extensions) but strip all other punctuation
     name = name.translate(str.maketrans("", "", string.punctuation.replace(".", ""))).replace(" ", "_")
     return name
     """
     # ---------- yt_dlp options ----------
     # Use a more specific template to avoid clashes and ensure proper naming
     tmpl = str(dst / "%(id)s.%(ext)s")
     ydl_opts = {
         "outtmpl": tmpl,
     # Always ensure the destination directory exists
     dst.mkdir(parents=True, exist_ok=True)
+    # Simple check for direct video file links (e.g., raw .mp4 link)
+    # Exclude common platforms that yt-dlp handles better even if they look like direct links
+    if url.lower().endswith(video_exts) and not any(platform in url for platform in ["youtube.com", "vimeo.com"]):
+        st.info(f"Attempting direct download for URL: {url}")
         return _download_direct(url, dst)
+    # Default to yt_dlp for all other cases (e.g., YouTube, Vimeo, generic pages that yt_dlp can parse)
+    st.info(f"Attempting download with yt-dlp for URL: {url}")
     return _download_with_yt_dlp(url, dst, password)
     return "\n".join(parts)
+def _strip_prompt_echo(prompt: str, text: str, similarity_threshold: float = 0.68) -> str:
     """
     Strips the prompt from the beginning of the generated text if it appears
+    as an echo, using difflib.SequenceMatcher for robust matching.
     Args:
         prompt: The original prompt sent to the model.
         text: The generated text from the model.
+        similarity_threshold: The similarity ratio (0.0 to 1.0) required for a match.
+                              A value of 0.68 means at least 68% of the prompt must be
+                              present at the beginning of the text to be considered an echo.
     Returns:
         The text with the prompt echo removed, or the original text if no echo
     if not prompt or not text:
         return text
+    # Normalize both prompt and text for robust comparison (lowercase, single spaces)
     clean_prompt = " ".join(prompt.lower().split()).strip()
     clean_text = " ".join(text.lower().split()).strip()
+    # Skip processing if either string is empty after cleaning, or if
+    # clean_prompt is more than twice as long as clean_text
+    if not clean_prompt or not clean_text or len(clean_prompt) > len(clean_text) * 2:
+        return text
+    # Use SequenceMatcher to find the longest matching block at the beginning
     matcher = SequenceMatcher(None, clean_prompt, clean_text)
+    # find_longest_match returns the longest common block anywhere in the two strings;
+    # the `match.b == 0` check below is what restricts it to the very beginning of `clean_text`.
     match = matcher.find_longest_match(0, len(clean_prompt), 0, len(clean_text))
+    if match.b == 0 and match.size > 0: # If a match starts at the beginning of the generated text
+        # Calculate the ratio of the matched segment to the *entire* prompt length.
+        match_ratio = match.size / len(clean_prompt)
+        if match_ratio >= similarity_threshold:
+            # High confidence that the prompt (or a very similar version)
+            # is echoed at the beginning of the generated text.
+            # Now, attempt to remove the echoed part from the original `text`.
+            original_text_idx = 0
+            original_prompt_idx = 0
+            # Iterate through both original strings, attempting to match characters
+            # while being tolerant of leading whitespace and punctuation in the text.
+            while original_text_idx < len(text) and original_prompt_idx < len(prompt):
+                char_text = text[original_text_idx]
+                char_prompt = prompt[original_prompt_idx]
+                if char_text.lower() == char_prompt.lower():
+                    # Characters match (case-insensitively), advance both pointers
+                    original_text_idx += 1
+                    original_prompt_idx += 1
+                elif char_text.isspace() or char_text in string.punctuation:
+                    # Current char in text is whitespace or punctuation,
+                    # and it's not matching the current prompt char.
+                    # Assume it's leading noise from the model's output; consume it.
+                    original_text_idx += 1
+                else:
+                    # Found a significant mismatch: the text character is neither
+                    # whitespace nor punctuation and differs from the prompt. Stop matching.
+                    break
+            # If a substantial portion of the prompt was "consumed" by this process,
+            # then we consider the prompt to have been echoed.
+            # Return the rest of the text, further stripping any residual leading
+            # whitespace/punctuation that the loop might have missed.
+            if original_prompt_idx / len(prompt) >= similarity_threshold:
+                return text[original_text_idx:].lstrip(" \n:-")
+    # If no significant match at the beginning, or threshold not met, return original text
     return text
         "video_path": "",
         "model_input": DEFAULT_MODEL,
         "prompt": DEFAULT_PROMPT,
+        "api_key": os.getenv("GOOGLE_API_KEY", ""),
         "video_password": "",
         "compress_mb": 200,
         "busy": False,
         "Compress if > (MB)",
         min_value=10,
         max_value=2000,
+        value=st.session_state["compress_mb"], # Simplified from .get()
         step=10,
         key="compress_mb",
     )
 # Entry point
 # ----------------------------------------------------------------------
 if __name__ == "__main__":
     main()