SHAFI
feat: production-ready backend with audio fixes and CORS update
c335df4
import feedparser
from typing import List
from datetime import datetime
from app.models import Article
import re
class RSSParser:
    """RSS feed parser for news sources.

    Google News feeds are scanned with regular expressions over the raw XML
    text; cloud-provider feeds are parsed with ``feedparser``. Both public
    entry points are best-effort: any exception is printed and an empty list
    is returned instead of propagating.
    """

    # Default cap on the number of articles returned by each parse_* method.
    DEFAULT_LIMIT = 20
    # Descriptions are truncated to this many characters.
    MAX_DESCRIPTION_LENGTH = 200
    # A cleaned description must be longer than this to count as real content.
    MIN_DESCRIPTION_LENGTH = 30

    _ITEM_RE = re.compile(r'<item>([\s\S]*?)</item>')
    _ANCHOR_TEXT_RE = re.compile(r'<a[^>]*>([^<]+)</a>')
    _IMG_SRC_RE = re.compile(r'<img[^>]+src=["\']([^"\']+)["\']')
    _OG_IMAGE_RE = re.compile(
        r'property=["\']og:image["\'][^>]*content=["\']([^"\']+)["\']'
    )

    async def parse_google_news(
        self, content: str, category: str, limit: int = DEFAULT_LIMIT
    ) -> List["Article"]:
        """Parse a Google News RSS document into ``Article`` objects.

        Args:
            content: Raw RSS XML text.
            category: Category assigned to every produced article.
            limit: Maximum number of ``<item>`` elements to convert
                (defaults to 20; negative values are treated as 0).

        Returns:
            The parsed articles, or an empty list if anything raised.
        """
        try:
            articles = []
            for item in self._ITEM_RE.findall(content)[:max(limit, 0)]:
                title = self._extract_tag(item, 'title') or 'No title'
                link = (
                    self._extract_tag(item, 'link')
                    or self._extract_tag(item, 'guid')
                    or ''
                )
                description = (
                    self._extract_tag(item, 'description')
                    or self._extract_tag(item, 'content:encoded')
                    or ''
                )
                # NOTE(review): this is passed on as a string, whereas
                # parse_provider_rss passes a datetime - confirm that the
                # Article model accepts both.
                pub_date = (
                    self._extract_tag(item, 'pubDate')
                    or self._extract_tag(item, 'published')
                    or datetime.now().isoformat()
                )

                image = self._extract_image_from_xml(
                    item, description, category, title
                )

                # Prefer the anchor text embedded in the description; when the
                # description has no literal anchor (e.g. it is entity-escaped)
                # fall back to the RSS <source> element before the default.
                source_match = self._ANCHOR_TEXT_RE.search(description)
                if source_match:
                    article_source = source_match.group(1)
                else:
                    article_source = (
                        self._extract_tag(item, 'source') or 'Google News'
                    )

                articles.append(
                    Article(
                        title=self._clean_html(title),
                        description=self._clean_google_news_description(
                            description
                        ),
                        url=link,
                        image_url=image,
                        published_at=pub_date,
                        source=self._clean_html(article_source),
                        category=category,
                    )
                )
            return articles
        except Exception as e:
            # Deliberate best-effort boundary: a broken feed must not crash
            # the caller.
            print(f"Error parsing Google News: {e}")
            return []

    @staticmethod
    def _attr_value(tag: str, name: str) -> str:
        """Return the value of attribute *name* inside one opening tag.

        Accepts single- or double-quoted values; returns '' when absent.
        """
        match = re.search(
            rf'(?<![\w:-]){re.escape(name)}\s*=\s*(["\'])(.*?)\1',
            tag,
            re.IGNORECASE | re.DOTALL,
        )
        return match.group(2).strip() if match else ''

    def _extract_image_from_xml(
        self, item: str, description: str, category: str, title: str
    ) -> str:
        """Extract an image URL from an RSS item, trying several sources.

        Order of precedence: ``media:content`` / ``media:thumbnail``,
        image ``<enclosure>``, an ``<img>`` tag in the description, then an
        ``og:image`` attribute pair in the description.

        Args:
            item: Raw XML of one ``<item>``.
            description: The item's description/content text.
            category: Unused; kept for interface compatibility.
            title: Unused; kept for interface compatibility.

        Returns:
            The image URL, or '' when none is found so the consumer can
            apply its own placeholder.
        """
        # 1. media:content / media:thumbnail carry the URL as an attribute.
        for tag in re.findall(r'<media:(?:content|thumbnail)\b[^>]*>', item):
            url = self._attr_value(tag, 'url')
            if url:
                return url

        # 2. Standard RSS enclosure. Skip enclosures explicitly typed as
        #    something other than an image (audio/video attachments).
        for tag in re.findall(r'<enclosure\b[^>]*>', item):
            url = self._attr_value(tag, 'url')
            mime = self._attr_value(tag, 'type')
            if url and (not mime or mime.lower().startswith('image')):
                return url

        # 3. <img src=...> inside the description (either quote style).
        img_match = self._IMG_SRC_RE.search(description)
        if img_match:
            return img_match.group(1)

        # 4. og:image property/content pair inside the description.
        og_match = self._OG_IMAGE_RE.search(description)
        if og_match:
            return og_match.group(1)

        # 5. Nothing found.
        return ""

    def _clean_google_news_description(self, description: str) -> str:
        """Reduce a Google News description to plain text, if it has any.

        Descriptions that merely link to ``news.google.com/rss/articles``
        are treated as carrying no content.

        Returns:
            Up to 200 characters of cleaned text, or '' when the description
            holds no meaningful content.
        """
        if 'news.google.com/rss/articles' in description:
            return ''  # Redirect link only, no real content.

        # Prefer whatever follows the first closing anchor tag.
        after_link_match = re.search(r'</a>([\s\S]*)', description)
        if after_link_match:
            extracted = self._clean_html(after_link_match.group(1))
            if len(extracted) > self.MIN_DESCRIPTION_LENGTH:
                return extracted[:self.MAX_DESCRIPTION_LENGTH]

        # Otherwise use the whole description when it is meaningful text
        # rather than a bare URL.
        full_clean = self._clean_html(description)
        if (
            len(full_clean) > self.MIN_DESCRIPTION_LENGTH
            and not full_clean.startswith('http')
        ):
            return full_clean[:self.MAX_DESCRIPTION_LENGTH]

        return ''

    def _extract_tag(self, xml: str, tag_name: str) -> str:
        """Return the stripped text content of the first ``<tag_name>`` element.

        Matching is case-insensitive. The tag name must match exactly
        (``link`` does not match ``<linkage>``), self-closing tags are
        skipped, and a value consisting of a single CDATA section is
        unwrapped.

        Returns:
            The element content, or '' when the element is not present.
        """
        name = re.escape(tag_name)
        pattern = rf'<{name}(?:\s[^>]*)?(?<!/)>([\s\S]*?)</{name}\s*>'
        match = re.search(pattern, xml, re.IGNORECASE)
        if not match:
            return ''

        value = match.group(1).strip()
        # CDATA is markup, not content: unwrap it so URLs and dates are clean.
        if value.startswith('<![CDATA[') and value.endswith(']]>'):
            inner = value[len('<![CDATA['):-len(']]>')]
            if ']]>' not in inner:
                value = inner.strip()
        return value

    def _clean_html(self, html: str) -> str:
        """Strip CDATA wrappers and HTML tags, decode entities, tidy whitespace.

        Entities are decoded in a single pass, so ``&amp;lt;`` yields the
        literal text ``&lt;``, and numeric references such as ``&#8217;`` are
        decoded rather than dropped.
        """
        # Local import: the parameter name ``html`` shadows the module name.
        from html import unescape

        text = html or ''
        # Unwrap CDATA sections.
        text = re.sub(r'<!\[CDATA\[([\s\S]*?)\]\]>', r'\1', text)
        # Remove complete tags, then any unterminated tag through the end of
        # the text, then stray closing brackets.
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'<[^>]*', '', text)
        text = text.replace('>', '')
        # Keep the ASCII renderings used for these two entities.
        for entity, char in (('&nbsp;', ' '), ('&hellip;', '...')):
            text = text.replace(entity, char)
        # Decode every remaining named or numeric entity exactly once.
        text = unescape(text)
        # Collapse runs of whitespace.
        return re.sub(r'\s+', ' ', text).strip()

    async def parse_provider_rss(
        self, content: str, provider: str, limit: int = DEFAULT_LIMIT
    ) -> List["Article"]:
        """Parse a cloud provider RSS/Atom feed with ``feedparser``.

        Args:
            content: Raw feed text.
            provider: Provider key; upper-cased for ``source`` and used in
                the ``cloud-<provider>`` category.
            limit: Maximum number of entries to convert (defaults to 20;
                negative values are treated as 0).

        Returns:
            The parsed articles, or an empty list if anything raised.
        """
        try:
            feed = feedparser.parse(content)
            articles = []
            for entry in feed.entries[:max(limit, 0)]:
                image_url = self._extract_image_from_entry(entry)

                # Fall back to the update timestamp when the entry carries no
                # publication date.
                published_at = self._parse_date(
                    entry.get('published') or entry.get('updated') or ''
                )

                description = entry.get('summary', '')
                if description:
                    # Same cleaning as the Google News path: strips tags and
                    # decodes entities.
                    description = self._clean_html(description)
                    if len(description) > self.MAX_DESCRIPTION_LENGTH:
                        description = (
                            description[:self.MAX_DESCRIPTION_LENGTH] + '...'
                        )

                articles.append(
                    Article(
                        title=entry.get('title', ''),
                        description=description,
                        url=entry.get('link', ''),
                        image_url=image_url,
                        published_at=published_at,
                        source=provider.upper(),
                        category=f'cloud-{provider}',
                    )
                )
            return articles
        except Exception as e:
            # Deliberate best-effort boundary: a broken feed must not crash
            # the caller.
            print(f"Error parsing provider RSS: {e}")
            return []

    def _extract_image_from_entry(self, entry) -> str:
        """Extract an image URL from a feed entry.

        Tries ``media_content``, ``media_thumbnail``, image enclosures and
        finally an ``<img>`` tag in the entry content or summary. A source
        that is present but has no usable URL no longer ends the search.

        Returns:
            The image URL, or '' when none is found.
        """
        # media:content, then media:thumbnail.
        for attr in ('media_content', 'media_thumbnail'):
            for media in getattr(entry, attr, None) or []:
                url = media.get('url', '')
                if url:
                    return url

        # Enclosures explicitly typed as images.
        for enclosure in getattr(entry, 'enclosures', None) or []:
            if (enclosure.get('type') or '').startswith('image'):
                href = enclosure.get('href', '')
                if href:
                    return href

        # <img> tags in the HTML content first, then in the summary.
        candidates = []
        entry_content = getattr(entry, 'content', None)
        if entry_content:
            candidates.append(entry_content[0].get('value', ''))
        candidates.append(getattr(entry, 'summary', ''))
        for markup in candidates:
            if markup:
                img_match = self._IMG_SRC_RE.search(markup)
                if img_match:
                    return img_match.group(1)

        # Nothing found.
        return ""

    def _parse_date(self, date_str: str) -> datetime:
        """Parse a date string, falling back to the current time.

        Uses ``dateutil`` when it is installed, otherwise the standard
        library's RFC 2822 parser (the format of RSS ``pubDate``).

        Returns:
            The parsed datetime, or ``datetime.now()`` when the string is
            empty or cannot be parsed.
        """
        if date_str:
            try:
                from dateutil import parser as dateutil_parser
            except ImportError:
                dateutil_parser = None

            try:
                if dateutil_parser is not None:
                    return dateutil_parser.parse(date_str)
                from email.utils import parsedate_to_datetime
                return parsedate_to_datetime(date_str)
            except (ValueError, OverflowError, TypeError):
                # Unparseable input: fall through to the default below.
                pass
        return datetime.now()