cryogenic22 committed on
Commit
ebcd35b
·
verified ·
1 Parent(s): f06e91c

Update agents/collection_agent.py

Browse files
Files changed (1) hide show
  1. agents/collection_agent.py +279 -84
agents/collection_agent.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import re
2
  import json
3
  import logging
@@ -7,19 +8,39 @@ import urllib.parse
7
  from bs4 import BeautifulSoup
8
  import streamlit as st
9
  from datetime import datetime, timedelta
10
- from typing import Any, Optional, List, Dict, Callable
11
 
12
  logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
14
 
15
  def robust_json_parse(text: str) -> Optional[Any]:
16
  text = text.strip()
 
 
 
 
 
17
  try:
18
  return json.loads(text)
19
  except json.JSONDecodeError:
20
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  def convert_post_count_str_to_number(count_str: str) -> float:
 
 
 
23
  try:
24
  cleaned = count_str.replace(',', '')
25
  if cleaned and cleaned[-1].lower() in ['k', 'm', 'b']:
@@ -33,6 +54,12 @@ def convert_post_count_str_to_number(count_str: str) -> float:
33
  return 0.0
34
 
35
  def parse_trends_from_raw_text(text: str, platform: str) -> List[Dict]:
 
 
 
 
 
 
36
  trends = []
37
  blocks = re.split(r"### \*\*Trend:", text)
38
  for block in blocks[1:]:
@@ -90,40 +117,101 @@ class TrendCollectionAgent:
90
 
91
  async def collect_trends(self,
92
  platforms: Optional[List[str]] = None,
 
93
  progress_callback: Optional[Callable[[int, int, str], None]] = None
94
  ) -> List[Dict]:
 
 
 
95
  if platforms is None:
96
  platforms = self.default_platforms
97
- steps_per_platform = 2 # 1 for fetching, 1 for processing
98
- total_steps = len(platforms) * steps_per_platform
 
99
  current_step = 0
100
- combined_trends = []
101
  for platform in platforms:
102
- current_step += 1
103
- if progress_callback:
104
- progress_callback(current_step, total_steps, f"Fetching {platform} data...")
105
  try:
106
- query = f"What are the current top trending topics on {platform}? List them with descriptions and the current number of posts."
 
 
 
 
 
 
107
  perplexity_response = self.perplexity_client.search(query)
108
- perplexity_raw = str(perplexity_response)
109
- self.add_debug_log("perplexity", f"Raw perplexity response for {platform}: {perplexity_raw[:300]}...")
110
- parsed_trends = parse_trends_from_raw_text(perplexity_raw, platform)
111
- combined_trends.extend(parsed_trends)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  except Exception as e:
113
- self.add_debug_log("collection", f"Error collecting {platform} trends: {str(e)}", "error")
114
  continue
115
- current_step += 1
116
- if progress_callback:
117
- progress_callback(current_step, total_steps, f"Processing {platform} data...")
118
- for trend in combined_trends:
119
- if trend.get("platform", "").lower() == platform.lower() and not trend.get("url"):
120
- trend["url"] = await self.fetch_trend_url(trend)
121
- return combined_trends
 
122
 
123
  async def fetch_trend_url(self, trend: Dict) -> str:
124
  """
125
- Enhanced function to fetch relevant URLs for social media trends.
126
- Uses multiple search queries and search engines with delays.
127
  """
128
  def is_valid_url(url: str, platform: str) -> bool:
129
  if not url:
@@ -203,6 +291,7 @@ class TrendCollectionAgent:
203
  clean_result = clean_url(url)
204
  self.add_debug_log("url_search", f"Found valid URL for '{trend_name}': {clean_result}")
205
  return clean_result
 
206
  all_urls = re.findall(r'href="(https?://[^"]+)"', html)
207
  for url in all_urls:
208
  if is_valid_url(url, platform):
@@ -215,84 +304,190 @@ class TrendCollectionAgent:
215
  self.add_debug_log("url_search", f"Critical error in fetch_trend_url: {str(e)}", "error")
216
  return trend.get("url", "")
217
 
218
- async def get_trend_sentiment(self, trend: Dict) -> str:
219
- """Analyze the sentiment of a trend."""
220
- prompt = f"""You are an expert in social media sentiment analysis.
221
- Analyze the following trend and classify its overall sentiment as Positive, Negative, or Neutral.
222
- Trend details:
223
- Name: {trend.get('name')}
224
- Description: {trend.get('description')}
225
- Posts: {trend.get('number_of_posts', '0')}
226
- Platform: {trend.get('platform')}
227
- Return your answer as one word (Positive, Negative, or Neutral) followed by a brief explanation.
228
- """
 
 
 
 
 
 
 
 
229
  try:
230
- response = await asyncio.to_thread(
231
- self.client_anthropic.messages.create,
232
- model="claude-3-5-sonnet-20241022",
233
- max_tokens=500,
234
- temperature=0.3,
235
- messages=[{"role": "user", "content": prompt}]
 
236
  )
237
- content = response.content[0].text if isinstance(response.content, list) else response.content
238
- return content.strip()
 
 
 
 
 
 
239
  except Exception as e:
240
- self.add_debug_log("sentiment", f"Error analyzing sentiment: {str(e)}", "error")
241
- return "Unknown"
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
- async def _scrape_reader_trends(self, platform: str) -> List[Dict]:
244
- collected_text = ""
245
- urls = st.session_state.source_urls.get(platform)
246
- if not urls:
247
- urls = [self.default_reader_sources.get(platform)]
248
- for url in urls:
249
- if not url.startswith("https://r.jina.ai/"):
250
- reader_url = "https://r.jina.ai/" + url
251
- else:
252
- reader_url = url
253
  try:
254
- async with aiohttp.ClientSession() as session:
255
- async with session.get(reader_url, timeout=10) as response:
256
- content = await response.text() if response.status == 200 else ""
257
- self.add_debug_log("reader", f"Fetched raw content from {reader_url} for {platform} (first 500 chars): {content[:500]}")
258
- collected_text += "\n" + content
259
- except Exception as e:
260
- self.add_debug_log("reader", f"Error fetching {reader_url}: {str(e)}", "error")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  continue
262
- return parse_trends_from_raw_text(collected_text, platform)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
 
264
- async def get_bank_recommendations(self, trends: List[Dict]) -> str:
265
- prompt = f"""You are an expert in digital marketing for financial institutions.
266
- Given the following trend data, identify which trends would be most suitable for a bank to leverage in its marketing strategy.
267
- Provide the names of the trends along with a brief reason for each selection.
268
- Trends:
269
- {json.dumps(trends, indent=2)}
270
- Return your response as plain text.
271
- """
272
- try:
273
- response = await asyncio.to_thread(
274
- self.client_anthropic.messages.create,
275
- model="claude-3-5-sonnet-20241022",
276
- max_tokens=1000,
277
- temperature=0.3,
278
- messages=[{"role": "user", "content": prompt}]
279
- )
280
- content = response.content[0].text if isinstance(response.content, list) else response.content
281
- return content.strip()
282
- except Exception as e:
283
- self.add_debug_log("bank", f"Error retrieving bank recommendations: {str(e)}", "error")
284
- return "No recommendations available"
 
 
 
 
 
 
 
 
 
 
 
285
 
286
  if __name__ == "__main__":
287
  import os
288
  from utils.api_clients import initialize_api_clients
289
  client_anthropic, client_openai, perplexity_client = initialize_api_clients()
290
  agent = TrendCollectionAgent(client_anthropic, perplexity_client)
 
291
  async def test_collect():
292
  trends = await agent.collect_trends(
293
  platforms=["TikTok", "Instagram"],
294
- progress_callback=lambda current, total, msg: print(f"[{current}/{total}] {msg}")
295
  )
296
  print("Curated Trends:")
297
  print(json.dumps(trends, indent=2))
 
298
  asyncio.run(test_collect())
 
1
+ from typing import Dict, List, Optional, Callable, Any
2
  import re
3
  import json
4
  import logging
 
8
  from bs4 import BeautifulSoup
9
  import streamlit as st
10
  from datetime import datetime, timedelta
 
11
 
12
  logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
14
 
15
def robust_json_parse(text: str) -> Optional[Any]:
    """Best-effort JSON extraction from LLM output.

    Handles three progressively messier cases:
      1. the text is valid JSON as-is,
      2. the text is a stringified Anthropic ``TextBlock(...)`` repr,
      3. a JSON array or object is embedded inside surrounding prose.

    Returns the parsed value, or ``None`` when nothing parseable is found.
    """
    candidate = text.strip()

    # Unwrap a stringified TextBlock(...) repr, keeping only its text payload.
    if "TextBlock(" in candidate:
        wrapper = re.search(
            r"TextBlock\(citations=None,\s*text='(.*?)',\s*type='text'\)",
            candidate,
            re.DOTALL,
        )
        if wrapper:
            candidate = wrapper.group(1)

    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        pass

    # Fall back to the first embedded JSON array, then the first object.
    for pattern in (r'(\[.*\])', r'(\{.*\})'):
        embedded = re.search(pattern, candidate, re.DOTALL)
        if embedded:
            try:
                return json.loads(embedded.group(1))
            except json.JSONDecodeError:
                continue
    return None
39
 
40
  def convert_post_count_str_to_number(count_str: str) -> float:
41
+ """
42
+ Converts a post count string (e.g., '1.2K', '2,000,000', '200') to a numerical value.
43
+ """
44
  try:
45
  cleaned = count_str.replace(',', '')
46
  if cleaned and cleaned[-1].lower() in ['k', 'm', 'b']:
 
54
  return 0.0
55
 
56
  def parse_trends_from_raw_text(text: str, platform: str) -> List[Dict]:
57
+ """
58
+ Parses raw markdown text containing trends.
59
+ Assumes each trend block begins with "### **Trend:".
60
+ Extracts trend name, discovery date, trend recap (as description),
61
+ and the current posts value.
62
+ """
63
  trends = []
64
  blocks = re.split(r"### \*\*Trend:", text)
65
  for block in blocks[1:]:
 
117
 
118
  async def collect_trends(self,
119
  platforms: Optional[List[str]] = None,
120
+ time_range: int = 60,
121
  progress_callback: Optional[Callable[[int, int, str], None]] = None
122
  ) -> List[Dict]:
123
+ """
124
+ Collect trends from Perplexity and then process them.
125
+ """
126
  if platforms is None:
127
  platforms = self.default_platforms
128
+
129
+ raw_trends = []
130
+ total_steps = len(platforms) * 2 # 2 steps per platform
131
  current_step = 0
132
+
133
  for platform in platforms:
 
 
 
134
  try:
135
+ if progress_callback:
136
+ progress_callback(current_step, total_steps, f"Analyzing {platform} trends...")
137
+
138
+ query = (
139
+ f"What are the current top trending topics on {platform} in the last {time_range} days? "
140
+ "List them with engagement metrics and descriptions."
141
+ )
142
  perplexity_response = self.perplexity_client.search(query)
143
+
144
+ current_step += 1
145
+ if progress_callback:
146
+ progress_callback(current_step, total_steps, f"Processing {platform} data...")
147
+
148
+ # Use the older curation prompt logic to extract trends.
149
+ curation_prompt = f"""Analyze this raw trend data and extract the key trends with their metrics.
150
+
151
+ The output should be a JSON array. Each trend object must have the following keys:
152
+ - name (string)
153
+ - description (string)
154
+ - engagement (integer; use 0 if not available)
155
+ - url (string)
156
+ - platform (string)
157
+ - audio_url (string; if available, else an empty string)
158
+ - number_of_posts (integer; if available, else 0)
159
+
160
+ For Instagram trends: Only include genuine trending topics and exclude aggregated Instagram reels or generic collections.
161
+
162
+ Raw data:
163
+ {perplexity_response}
164
+
165
+ Output only a valid JSON array.
166
+ """
167
+ try:
168
+ response = await asyncio.to_thread(
169
+ self.client_anthropic.messages.create,
170
+ model="claude-3-5-sonnet-20241022",
171
+ max_tokens=2000,
172
+ temperature=0,
173
+ messages=[{"role": "user", "content": curation_prompt}]
174
+ )
175
+ content = response.content[0].text if isinstance(response.content, list) else response.content
176
+ content = content.strip()
177
+ self.add_debug_log("curation", f"Raw curation response for {platform}: {content[:300]}...")
178
+ platform_trends = None
179
+ try:
180
+ platform_trends = json.loads(content)
181
+ except json.JSONDecodeError:
182
+ json_match = re.search(r'\[.*\]', content, re.DOTALL)
183
+ if json_match:
184
+ try:
185
+ platform_trends = json.loads(json_match.group())
186
+ except json.JSONDecodeError:
187
+ self.add_debug_log("curation", f"Failed to parse JSON for {platform}", "error")
188
+ if platform_trends and isinstance(platform_trends, list):
189
+ raw_trends.extend(platform_trends)
190
+ else:
191
+ self.add_debug_log("curation", f"Invalid trends format for {platform}", "error")
192
+ except Exception as e:
193
+ self.add_debug_log("curation", f"Error processing {platform} trends: {str(e)}", "error")
194
+ continue
195
+
196
+ current_step += 1
197
+ if progress_callback:
198
+ progress_callback(current_step, total_steps, f"Completed {platform} analysis")
199
+
200
  except Exception as e:
201
+ self.add_debug_log("collection", f"Error processing platform {platform}: {str(e)}", "error")
202
  continue
203
+
204
+ # For each collected trend, update the URL if missing.
205
+ for trend in raw_trends:
206
+ if not trend.get("url"):
207
+ trend["url"] = await self.fetch_trend_url(trend)
208
+
209
+ # (Optional) You can filter or sort trends here.
210
+ return raw_trends
211
 
212
  async def fetch_trend_url(self, trend: Dict) -> str:
213
  """
214
+ Fetch a URL for the given trend using multiple search queries and engines.
 
215
  """
216
  def is_valid_url(url: str, platform: str) -> bool:
217
  if not url:
 
291
  clean_result = clean_url(url)
292
  self.add_debug_log("url_search", f"Found valid URL for '{trend_name}': {clean_result}")
293
  return clean_result
294
+ # Fallback: use regex to extract all URLs
295
  all_urls = re.findall(r'href="(https?://[^"]+)"', html)
296
  for url in all_urls:
297
  if is_valid_url(url, platform):
 
304
  self.add_debug_log("url_search", f"Critical error in fetch_trend_url: {str(e)}", "error")
305
  return trend.get("url", "")
306
 
307
+ def _curate_trends(self, trends: List[Dict]) -> List[Dict]:
308
+ prepared_trends = self._prepare_for_json(trends)
309
+ prompt = f"""You are an expert in digital trends. Consolidate the following raw trend data into a curated JSON array.
310
+ Each trend object must have the following keys:
311
+ - name (string): a clearly defined trend name
312
+ - description (string): a concise description of the trend
313
+ - engagement (integer): the engagement value (or 0 if unavailable)
314
+ - url (string): the source URL
315
+ - platform (string): e.g., TikTok or Instagram
316
+ - audio_url (string): if available, else an empty string
317
+ - number_of_posts (integer): if available, else 0
318
+
319
+ Remove duplicates and only include trends that are relevant and published in the last 60 days.
320
+
321
+ Raw data:
322
+ {json.dumps(prepared_trends, indent=2)}
323
+
324
+ Output only a valid JSON array and nothing else."""
325
+ self.add_debug_log("curation", "Sending curation prompt to Claude LLM.")
326
  try:
327
+ response = asyncio.run(
328
+ asyncio.to_thread(
329
+ self.client_anthropic.messages.create,
330
+ model="claude-3-5-sonnet-20241022",
331
+ max_tokens=3000,
332
+ messages=[{"role": "user", "content": prompt}]
333
+ )
334
  )
335
+ content = str(response.content) if not isinstance(response.content, dict) else json.dumps(response.content)
336
+ self.add_debug_log("curation_raw", f"Raw curation response:\n{content}")
337
+ parsed = robust_json_parse(content)
338
+ if parsed is None:
339
+ self.add_debug_log("curation", "Robust JSON parsing failed for curation response.", "error")
340
+ parsed = self._parse_trends_from_text(content, "unknown")
341
+ curated_trends = parsed if isinstance(parsed, list) else []
342
+ return curated_trends
343
  except Exception as e:
344
+ self.add_debug_log("curation", f"Error during trend curation: {e}", "error")
345
+ return trends
346
+
347
+ def _prepare_for_json(self, trends: List[Dict]) -> List[Dict]:
348
+ prepared = []
349
+ for trend in trends:
350
+ new_trend = {}
351
+ for key, value in trend.items():
352
+ if isinstance(value, datetime):
353
+ new_trend[key] = value.isoformat()
354
+ else:
355
+ new_trend[key] = value
356
+ prepared.append(new_trend)
357
+ return prepared
358
 
359
+ def _extract_posts_count(self, text: str) -> int:
360
+ match = re.search(r'(\d+(?:\.\d+)?)\s*(?:posts|post)', text, re.IGNORECASE)
361
+ if match:
 
 
 
 
 
 
 
362
  try:
363
+ return int(float(match.group(1)))
364
+ except Exception:
365
+ return 0
366
+ return 0
367
+
368
+ def _parse_trends_from_text(self, text: str, platform: str) -> List[Dict]:
369
+ trends = []
370
+ current_trend = None
371
+ if isinstance(text, (list, dict)):
372
+ text = str(text)
373
+ published_time = None
374
+ pub_match = re.search(r'Published Time:\s*([\d\-T:+]+)', text)
375
+ if pub_match:
376
+ try:
377
+ published_time = datetime.fromisoformat(pub_match.group(1))
378
+ except Exception:
379
+ published_time = None
380
+ lines = text.split('\n')
381
+ trend_indicators = [
382
+ r'trend:?\s*"([^"]+)"',
383
+ r'trend:?\s*(.+?(?=\s*[\n\-•]|$))',
384
+ r'"([\w\s&]+)"\s*(?:trend|challenge)',
385
+ r'#(\w+)',
386
+ ]
387
+ def is_valid_trend_name(name: str) -> bool:
388
+ invalid_patterns = [
389
+ r'metrics$', r'statistics$', r'engagement$', r'trends$', r'analytics$',
390
+ r'strategy$', r'content$', r'marketing$', r'features?$', r'format$',
391
+ r'^type', r'^category', r'^format', r'^strategy', r'^content'
392
+ ]
393
+ name = name.lower().strip()
394
+ if any(re.search(pattern, name, re.IGNORECASE) for pattern in invalid_patterns):
395
+ return False
396
+ if len(name) < 3 or len(name.split()) > 8:
397
+ return False
398
+ if not re.search(r'[a-zA-Z]', name):
399
+ return False
400
+ return True
401
+ def extract_trend_name(line: str) -> Optional[str]:
402
+ for pattern in trend_indicators:
403
+ match = re.search(pattern, line, re.IGNORECASE)
404
+ if match:
405
+ name = match.group(1).strip()
406
+ if is_valid_trend_name(name):
407
+ return name
408
+ return None
409
+ for line in lines:
410
+ line = line.strip()
411
+ if not line:
412
  continue
413
+ trend_name = extract_trend_name(line)
414
+ if trend_name:
415
+ if current_trend:
416
+ trends.append(current_trend)
417
+ description = re.sub(f'{trend_name}|trend:|challenge:', '', line, flags=re.IGNORECASE).strip()
418
+ current_trend = {
419
+ 'name': trend_name,
420
+ 'description': description,
421
+ 'platform': platform,
422
+ 'discovery_date': published_time if published_time is not None else datetime.now(),
423
+ 'engagement': self._extract_numeric_value(line) or 0,
424
+ 'demographics': [],
425
+ 'regions': [],
426
+ 'content_type': self._determine_content_type({'platform': platform, 'description': description}),
427
+ 'audio_url': "",
428
+ 'number_of_posts': 0
429
+ }
430
+ elif current_trend:
431
+ lower_line = line.lower()
432
+ if 'engagement' in lower_line or any(word in lower_line for word in ['views', 'likes']):
433
+ current_trend['engagement'] = self._extract_numeric_value(line)
434
+ elif any(word in lower_line for word in ['demographic', 'audience']):
435
+ current_trend['demographics'].append(line)
436
+ elif any(word in lower_line for word in ['region', 'country']):
437
+ current_trend['regions'].append(line)
438
+ elif 'post' in lower_line:
439
+ posts_count = self._extract_posts_count(line)
440
+ current_trend['number_of_posts'] = max(current_trend.get('number_of_posts', 0), posts_count)
441
+ if current_trend:
442
+ trends.append(current_trend)
443
+ validated_trends = [trend for trend in trends if is_valid_trend_name(trend['name'])]
444
+ return validated_trends
445
 
446
+ def _extract_numeric_value(self, text: str) -> int:
447
+ matches = re.findall(r'(\d+(?:\.\d+)?)\s*([kmb])?(?:\s+(?:views|likes|shares|engagement))?', text.lower())
448
+ highest_value = 0
449
+ for value_str, suffix in matches:
450
+ try:
451
+ value = float(value_str)
452
+ if suffix:
453
+ multiplier = {'k': 1000, 'm': 1000000, 'b': 1000000000}.get(suffix.lower(), 1)
454
+ value *= multiplier
455
+ highest_value = max(highest_value, int(value))
456
+ except (ValueError, TypeError):
457
+ continue
458
+ return highest_value
459
+
460
+ def _determine_content_type(self, trend: Dict) -> str:
461
+ platform = trend.get('platform', '')
462
+ description = trend.get('description', '').lower()
463
+ if platform == 'TikTok':
464
+ if any(word in description for word in ['challenge', 'dance']):
465
+ return 'challenge'
466
+ if any(word in description for word in ['sound', 'audio', 'song']):
467
+ return 'sound'
468
+ return 'video'
469
+ elif platform == 'Instagram':
470
+ if any(word in description for word in ['reel', 'video']):
471
+ return 'video'
472
+ if any(word in description for word in ['photo', 'image', 'picture']):
473
+ return 'image'
474
+ if any(word in description for word in ['challenge', 'dance']):
475
+ return 'challenge'
476
+ return 'mixed'
477
+ return 'mixed'
478
 
479
if __name__ == "__main__":
    # Manual smoke test: collect trends for two platforms and print them.
    import os
    from utils.api_clients import initialize_api_clients

    client_anthropic, client_openai, perplexity_client = initialize_api_clients()
    agent = TrendCollectionAgent(client_anthropic, perplexity_client)

    async def test_collect():
        collected = await agent.collect_trends(
            platforms=["TikTok", "Instagram"],
            progress_callback=lambda step, total, msg: print(f"[{step}/{total}] {msg}")
        )
        print("Curated Trends:")
        print(json.dumps(collected, indent=2))

    asyncio.run(test_collect())