File size: 9,205 Bytes
d85c750
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c335df4
 
d85c750
 
 
 
 
 
 
 
 
 
 
 
c335df4
 
d85c750
 
 
c335df4
 
 
 
 
d85c750
c335df4
 
 
d85c750
 
c335df4
 
 
 
 
 
 
 
 
 
d85c750
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c335df4
 
d85c750
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c335df4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d85c750
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import feedparser
from typing import List
from datetime import datetime
from app.models import Article
import re

class RSSParser:
    """RSS feed parser for news sources.

    Two entry points:
      * parse_google_news  — regex-based parsing of Google News RSS XML.
      * parse_provider_rss — feedparser-based parsing of cloud provider feeds.
    """

    async def parse_google_news(self, content: str, category: str) -> List["Article"]:
        """Parse a Google News RSS feed with regex-based XML extraction.

        Args:
            content: Raw RSS XML text of the feed.
            category: Category label attached to every resulting Article.

        Returns:
            Up to 20 Articles; an empty list if parsing fails entirely.
        """
        try:
            articles = []

            # Extract <item> elements textually; the feed is treated as a
            # string and never run through a real XML parser.
            item_regex = r'<item>([\s\S]*?)</item>'
            matches = re.findall(item_regex, content)

            for item in matches[:20]:  # Limit to 20 articles
                title = self._extract_tag(item, 'title') or 'No title'
                link = self._extract_tag(item, 'link') or self._extract_tag(item, 'guid') or ''
                description = self._extract_tag(item, 'description') or self._extract_tag(item, 'content:encoded') or ''
                pub_date = self._extract_tag(item, 'pubDate') or self._extract_tag(item, 'published') or datetime.now().isoformat()

                # Extract image from multiple sources
                image = self._extract_image_from_xml(item, description, category, title)

                # Extract source name from description (Google News format: <a href="...">Source</a>)
                source_match = re.search(r'<a[^>]*>([^<]+)</a>', description)
                article_source = source_match.group(1) if source_match else 'Google News'

                # Clean description (Google News RSS only contains links, not actual content)
                cleaned_description = self._clean_google_news_description(description)

                article = Article(
                    title=self._clean_html(title),
                    description=cleaned_description,
                    url=link,
                    image_url=image,
                    # NOTE(review): pub_date is a string here while
                    # parse_provider_rss passes a datetime — confirm the
                    # Article model accepts both.
                    published_at=pub_date,
                    source=self._clean_html(article_source),
                    category=category
                )
                articles.append(article)

            return articles
        except Exception as e:
            # Boundary handler: a malformed feed degrades to an empty list
            # instead of propagating to the caller.
            print(f"Error parsing Google News: {e}")
            return []

    def _extract_image_from_xml(self, item: str, description: str, category: str, title: str) -> str:
        """Extract an image URL from an RSS <item>, trying several sources.

        Tries, in order: media:content/media:thumbnail, <enclosure>, an
        <img> tag inside the description, then an og:image meta pattern.
        Returns '' when nothing matches so the frontend can fall back to
        its placeholder banner. `category` and `title` are accepted for
        interface stability but are not currently used in the lookup.
        """
        # 1. Try media:content or media:thumbnail with namespace handling
        # Many feeds use media:content URL attribute directly
        media_match = re.search(r'<media:(content|thumbnail)[^>]*url="([^"]+)"', item)
        if media_match:
            return media_match.group(2)

        # 2. Try enclosure tag (standard RSS)
        enclosure_match = re.search(r'<enclosure[^>]*url="([^"]+)"', item)
        if enclosure_match:
            return enclosure_match.group(1)

        # 3. Try parsing <img> tag from description or content:encoded
        # Look for src attribute in img tags, supporting both single and double quotes
        img_match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', description)
        if img_match:
            return img_match.group(1)

        # 4. Try looking for og:image pattern if inside CDATA
        og_match = re.search(r'property=["\']og:image["\'][^>]*content=["\']([^"\']+)["\']', description)
        if og_match:
            return og_match.group(1)

        # 5. Return empty string to let Frontend handle the fallback
        # User requested: "if there is no image came while fetching then we banner our segmento pulse banner"
        # The frontend uses /placeholder-news.svg when image is empty
        return ""

    def _clean_google_news_description(self, description: str) -> str:
        """Clean a Google News description, which usually only holds links.

        Returns '' for link-only descriptions, otherwise up to 200 chars of
        HTML-stripped text when something meaningful (> 30 chars, not a bare
        URL) survives cleaning.
        """
        # Check if this is a Google News link-only description
        if 'news.google.com/rss/articles' in description:
            return ''  # No real content, just redirect links

        # Try to extract content after the link
        after_link_match = re.search(r'</a>([\s\S]*)', description)
        if after_link_match:
            extracted = self._clean_html(after_link_match.group(1))
            if len(extracted) > 30:
                return extracted[:200]

        # Fallback: clean entire description if meaningful
        full_clean = self._clean_html(description)
        if len(full_clean) > 30 and not full_clean.startswith('http'):
            return full_clean[:200]

        return ''

    def _extract_tag(self, xml: str, tag_name: str) -> str:
        """Extract the stripped text content of the first <tag_name> element.

        Returns '' when the tag is absent. Matching is case-insensitive and
        tolerates attributes on the opening tag.
        """
        pattern = f'<{tag_name}[^>]*>([\\s\\S]*?)</{tag_name}>'
        match = re.search(pattern, xml, re.IGNORECASE)
        return match.group(1).strip() if match else ''

    def _clean_html(self, html: str) -> str:
        """Remove HTML tags, strip CDATA wrappers, and decode entities."""
        text = html

        # Remove CDATA wrappers, keeping their content
        text = re.sub(r'<!\[CDATA\[([\s\S]*?)\]\]>', r'\1', text)

        # Remove HTML tags (extra passes catch unterminated tags and stray '>')
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'<[^>]*', '', text)
        text = re.sub(r'>', '', text)

        # Decode named HTML entities
        entities = {
            '&nbsp;': ' ', '&amp;': '&', '&lt;': '<', '&gt;': '>',
            '&quot;': '"', '&#39;': "'", '&apos;': "'",
            '&hellip;': '...', '&mdash;': '—', '&ndash;': '–'
        }
        for entity, char in entities.items():
            text = text.replace(entity, char)

        # Decode numeric entities (e.g. &#8217; -> right single quote).
        # Previously these were deleted outright, silently losing characters
        # such as curly quotes and dashes. Out-of-range code points become ''.
        def _decode_numeric(match: "re.Match") -> str:
            code = int(match.group(1))
            return chr(code) if 0 < code <= 0x10FFFF else ''
        text = re.sub(r'&#(\d+);', _decode_numeric, text)

        # Clean whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    async def parse_provider_rss(self, content: str, provider: str) -> List["Article"]:
        """Parse a cloud provider RSS feed via feedparser.

        Args:
            content: Raw RSS/Atom feed text.
            provider: Provider slug; uppercased for the source field and
                embedded in the category as 'cloud-<provider>'.

        Returns:
            Up to 20 Articles; an empty list if parsing fails entirely.
        """
        try:
            feed = feedparser.parse(content)
            articles = []

            for entry in feed.entries[:20]:
                # Extract image
                image_url = self._extract_image_from_entry(entry)

                # Parse date
                published_at = self._parse_date(entry.get('published', ''))

                # Get description
                description = entry.get('summary', '')
                if description:
                    # Strip HTML tags and cap at ~200 chars
                    description = re.sub(r'<[^>]+>', '', description)
                    description = description[:200] + '...' if len(description) > 200 else description

                article = Article(
                    title=entry.get('title', ''),
                    description=description,
                    url=entry.get('link', ''),
                    image_url=image_url,
                    published_at=published_at,
                    source=provider.upper(),
                    category=f'cloud-{provider}'
                )
                articles.append(article)

            return articles
        except Exception as e:
            # Boundary handler: a malformed feed degrades to an empty list.
            print(f"Error parsing provider RSS: {e}")
            return []

    def _extract_image_from_entry(self, entry) -> str:
        """Extract an image URL from a feedparser entry, trying fallbacks.

        Tries media:content, media:thumbnail, image enclosures, then an
        <img> tag inside content/summary. Returns '' so the frontend can
        use its standard banner when nothing is found.
        """
        # Try media:content
        if hasattr(entry, 'media_content') and entry.media_content:
            return entry.media_content[0].get('url', '')

        # Try media:thumbnail
        if hasattr(entry, 'media_thumbnail') and entry.media_thumbnail:
            return entry.media_thumbnail[0].get('url', '')

        # Try enclosures
        if hasattr(entry, 'enclosures') and entry.enclosures:
            for enclosure in entry.enclosures:
                if enclosure.get('type', '').startswith('image'):
                    return enclosure.get('href', '')

        # Try HTML content/summary for <img> tags
        content = ''
        if hasattr(entry, 'content') and entry.content:
            content = entry.content[0].get('value', '')
        elif hasattr(entry, 'summary'):
            content = entry.summary

        if content:
            # `re` is already imported at module level
            img_match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', content)
            if img_match:
                return img_match.group(1)

        # Default: Return empty to let Frontend use standard banner
        return ""

    def _parse_date(self, date_str: str) -> datetime:
        """Parse a date string to a datetime, falling back to now().

        The previous bare ``except:`` also swallowed KeyboardInterrupt and
        SystemExit; only the exceptions the import or parse can realistically
        raise are caught now.
        """
        try:
            # feedparser usually provides a parsed date
            # but we'll handle string parsing as fallback
            from dateutil import parser
            return parser.parse(date_str)
        except (ImportError, ValueError, OverflowError, TypeError):
            return datetime.now()