from flask import Flask, render_template, request, redirect, url_for, jsonify
from flask_caching import Cache
import aiohttp
import asyncio
from bs4 import BeautifulSoup, NavigableString
#from playwright.async_api import async_playwright
from datetime import datetime, timedelta, timezone
import os
import calendar
import praw
import re
#from dotenv import load_dotenv

#load_dotenv()

API_KEY = os.environ.get('YT_API_KEY')
REDDIT_CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID')
REDDIT_CLIENT_SECRET = os.environ.get('REDDIT_API_KEY')
REDDIT_USER_AGENT = "user age by u/Hemanth"

app = Flask(__name__)
cache = Cache(app, config={'CACHE_TYPE': 'simple'})

YOUTUBE_API_URL = 'https://www.googleapis.com/youtube/v3/playlistItems'
PLAYLIST_API_URL = 'https://www.googleapis.com/youtube/v3/playlists'
WATCHED_VIDEOS_FILE = 'watched_videos.txt'
PLAYLISTS_FILE = 'playlists.txt'
LISTENED_EPISODES_FILE = 'listened_episodes.txt'


def _absolute_url(href, base):
    """Return *href* unchanged if it is already absolute, else prefix it with *base*."""
    if href.startswith('http'):
        return href
    if href.startswith('/'):
        return f"{base}{href}"
    return f"{base}/{href}"


def _current_year():
    """Return the current calendar year as a string (default for year filters)."""
    return str(datetime.now().year)


async def fetch_videos_from_playlist(session, playlist_id):
    """Return the first page (up to 50) of playlist items, or [] on any failure."""
    params = {
        'part': 'snippet',
        'playlistId': playlist_id,
        'maxResults': 50,
        'key': API_KEY
    }
    try:
        async with session.get(YOUTUBE_API_URL, params=params) as response:
            if response.status != 200:
                print(f"Failed to fetch data for playlist {playlist_id}: {response.status} {await response.text()}")
                return []
            data = await response.json()
            return data.get('items', [])
    except Exception as e:
        print(f"Error in fetch_videos_from_playlist for {playlist_id}: {e}")
        return []


async def fetch_playlist_title(session, playlist_id):
    """Return {'id', 'title', 'channel_title'} for a playlist, or None if unavailable."""
    params = {
        'part': 'snippet',
        'id': playlist_id,
        'key': API_KEY,
    }
    try:
        async with session.get(PLAYLIST_API_URL, params=params) as response:
            if response.status == 200:
                data = await response.json()
                if data.get('items'):
                    first = data['items'][0]
                    return {
                        'id': first['id'],
                        'title': first['snippet']['title'],
                        'channel_title': first['snippet']['channelTitle']
                    }
            else:
                print(f"Failed to fetch playlist title for {playlist_id}: {response.status} {await response.text()}")
    except Exception as e:
        print(f"Error in fetch_playlist_title for {playlist_id}: {e}")
    return None


reddit = praw.Reddit(
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_CLIENT_SECRET,
    user_agent=REDDIT_USER_AGENT
)

SUBREDDITS = ['UPSC', 'SideProject', 'datascience', 'explainlikeimfive', 'Krishnamurti', 'ycombinator',
              'OpenAI', 'programming', 'AskReddit', 'worldnews', 'politics']


@app.route('/get_posts', methods=['POST'])
def get_posts():
    """Return posts for the requested subreddits as a JSON list.

    Expects a JSON object with optional keys ``subreddits`` (list of names),
    ``sort`` ('hot' | 'top' | 'new') and ``limit`` (int, default 10). A
    subreddit that fails is logged and skipped; the others are still returned.
    """
    # A missing or non-object body is treated as an empty request instead of
    # raising AttributeError on ``None.get``.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    selected_subreddits = data.get('subreddits', [])
    sort_method = data.get('sort', 'hot')
    try:
        limit = int(data.get('limit', 10))
    except (TypeError, ValueError):
        limit = 10

    all_posts = []
    for subreddit_name in selected_subreddits:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            if sort_method == 'top':
                posts_iterable = subreddit.top(limit=limit)
            elif sort_method == 'new':
                posts_iterable = subreddit.new(limit=limit)
            else:
                posts_iterable = subreddit.hot(limit=limit)
            for post in posts_iterable:
                selftext = post.selftext
                all_posts.append({
                    'subreddit': subreddit_name,
                    'title': post.title,
                    'url': post.url,
                    'author': post.author.name if post.author else '[deleted]',
                    'score': post.score,
                    'num_comments': post.num_comments,
                    'created_utc': post.created_utc,
                    'selftext': selftext[:300] + '...' if len(selftext) > 300 else selftext,
                    'link': f"https://reddit.com{post.permalink}",
                    'flair': post.link_flair_text if post.link_flair_text else 'No flair',
                    'nsfw': post.over_18,
                    'spoiler': post.spoiler
                })
        except Exception as e:
            print(f"Error fetching posts from subreddit {subreddit_name}: {e}")
    return jsonify(all_posts)


def filter_videos_by_date(videos, days=5):
    """Return simplified dicts for playlist items published within the last *days* days."""
    if not videos:
        return []
    recent_videos = []
    cutoff_date = datetime.now(timezone.utc) - timedelta(days=days)
    for video in videos:
        try:
            snippet = video['snippet']
            publish_date = datetime.strptime(
                snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ"
            ).replace(tzinfo=timezone.utc)
            if publish_date >= cutoff_date:
                video_id = snippet['resourceId']['videoId']
                recent_videos.append({
                    'videoId': video_id,
                    'title': snippet['title'],
                    'url': f"https://www.youtube.com/watch?v={video_id}",
                    'publishedAt': publish_date
                })
        except (KeyError, TypeError, ValueError) as e:
            # Build the label defensively: *video* may not even be a dict here.
            title = 'Unknown Video'
            if isinstance(video, dict) and isinstance(video.get('snippet'), dict):
                title = video['snippet'].get('title', title)
            print(f"Error processing video data: {title}. Error: {e}")
            continue
    return recent_videos


def load_watched_videos():
    """Return the set of video IDs recorded in WATCHED_VIDEOS_FILE (empty if absent)."""
    if os.path.exists(WATCHED_VIDEOS_FILE):
        with open(WATCHED_VIDEOS_FILE, 'r', encoding='utf-8') as file:
            return {line.strip() for line in file}
    return set()


def save_watched_videos(video_ids):
    """Append *video_ids* to WATCHED_VIDEOS_FILE, one per line; I/O errors are logged."""
    try:
        with open(WATCHED_VIDEOS_FILE, 'a', encoding='utf-8') as file:
            file.writelines(f"{video_id}\n" for video_id in video_ids)
    except IOError as e:
        print(f"Error saving watched videos: {e}")


def filter_unseen_videos(videos):
    """Return the videos whose 'videoId' is not in the watched file."""
    if not videos:
        return []
    watched_videos = load_watched_videos()
    return [video for video in videos if video['videoId'] not in watched_videos]


async def load_playlists(session):
    """Read playlist IDs from PLAYLISTS_FILE and fetch their titles concurrently."""
    playlists_data = []
    if os.path.exists(PLAYLISTS_FILE):
        try:
            with open(PLAYLISTS_FILE, 'r', encoding='utf-8') as file:
                # Skip blank lines so an empty ID is never sent to the API.
                playlist_ids = [line.strip() for line in file if line.strip()]
            tasks = [fetch_playlist_title(session, playlist_id) for playlist_id in playlist_ids]
            playlist_details_results = await asyncio.gather(*tasks, return_exceptions=True)
            for detail in playlist_details_results:
                if isinstance(detail, Exception):
                    print(f"Error fetching a playlist detail: {detail}")
                elif detail:
                    playlists_data.append(detail)
        except Exception as e:
            print(f"Error loading playlists from file or fetching details: {e}")
    return playlists_data


def add_playlist(playlist_id):
    """Append *playlist_id* to PLAYLISTS_FILE; I/O errors are logged."""
    try:
        with open(PLAYLISTS_FILE, 'a', encoding='utf-8') as file:
            file.write(f"{playlist_id}\n")
    except IOError as e:
        print(f"Error adding playlist to file: {e}")


async def scrape_air_content(url, title_filter):
    """Scrape the broadcast table at *url* and return episodes whose title is in *title_filter*.

    Each episode is {'title', 'date' (naive datetime), 'audio_link'}. Rows
    without an audio source or with an unparseable date are skipped.
    """
    episodes = []
    try:
        async with aiohttp.ClientSession() as session:
            # Same timeout the other scrapers in this file use.
            async with session.get(url, timeout=20) as response:
                if response.status != 200:
                    print(f"Failed to fetch data from {url}: {response.status}")
                    return []
                content = await response.text()
        soup = BeautifulSoup(content, "html.parser")
        table = soup.find('table', class_='table')
        if not table:
            print(f"Table not found on {url}")
            return []
        for row in table.find_all('tr')[1:]:  # first row is the header
            cols = row.find_all('td')
            if len(cols) < 4:
                continue
            title = cols[0].text.strip()
            date_str = cols[1].text.strip()
            time_str = cols[2].text.strip()
            audio_src = None
            audio_tag = cols[3].find('audio')
            if audio_tag:
                source_tag = audio_tag.find('source')
                if source_tag and 'src' in source_tag.attrs:
                    audio_src = source_tag['src']
            if not audio_src:
                continue
            try:
                date_obj = datetime.strptime(f"{date_str} {time_str}", '%d %b %Y %H:%M')
            except ValueError:
                print(f"Could not parse date for AIR episode: {date_str} {time_str}")
                continue
            if title in title_filter:
                episodes.append({
                    'title': title,
                    'date': date_obj,
                    'audio_link': audio_src
                })
    except Exception as e:
        print(f"Error scraping AIR content from {url} for titles {title_filter}: {e}")
    return episodes


async def scrape_air_spotlight():
    """Return 'Spotlight' episodes from the AIR daily broadcast page."""
    return await scrape_air_content(
        "https://www.newsonair.gov.in/listen-broadcast-category/daily-broadcast/", ["Spotlight"])


async def scrape_air_insight():
    """Return 'Insight'/'Insights' episodes from the AIR weekly broadcast page."""
    return await scrape_air_content(
        "https://www.newsonair.gov.in/listen-broadcast-category/weekly-broadcast/", ["Insight", "Insights"])


async def scrape_air_economy():
    """Return 'Money Talk' episodes from the AIR weekly broadcast page."""
    return await scrape_air_content(
        "https://www.newsonair.gov.in/listen-broadcast-category/weekly-broadcast/", ["Money Talk"])


async def scrape_current_affairs_air():
    """Return 'Current Affairs' episodes from the AIR weekly broadcast page."""
    return await scrape_air_content(
        "https://www.newsonair.gov.in/listen-broadcast-category/weekly-broadcast/", ["Current Affairs"])


def filter_recent_episodes(episodes, days=3):
    """Return episodes whose (naive) 'date' falls within the last *days* days."""
    if not episodes:
        return []
    # Naive on purpose: scrape_air_content produces naive datetimes.
    cutoff_date = datetime.now() - timedelta(days=days)
    return [episode for episode in episodes if episode['date'] >= cutoff_date]


def load_listened_episodes():
    """Return the set of audio links recorded in LISTENED_EPISODES_FILE (empty if absent)."""
    if os.path.exists(LISTENED_EPISODES_FILE):
        with open(LISTENED_EPISODES_FILE, 'r', encoding='utf-8') as file:
            return {line.strip() for line in file}
    return set()


def save_listened_episodes(episode_links):
    """Append *episode_links* to LISTENED_EPISODES_FILE, one per line; I/O errors are logged."""
    try:
        with open(LISTENED_EPISODES_FILE, 'a', encoding='utf-8') as file:
            file.writelines(f"{link}\n" for link in episode_links)
    except IOError as e:
        print(f"Error saving listened episodes: {e}")


def filter_unheard_episodes(episodes):
    """Return the episodes whose 'audio_link' is not in the listened file."""
    if not episodes:
        return []
    listened_episodes = load_listened_episodes()
    return [episode for episode in episodes if episode.get('audio_link') not in listened_episodes]


async def scrape_pib_asp_net(url, ministry=None, year=None, month=None, day=None):
    """Submit the PIB ASP.NET filter form at *url* and return the listed items.

    The page is fetched once to collect its form inputs, which are then posted
    back with the ministry/year/month/day dropdown values overridden. Returns
    a list of {'title', 'url', 'date'}; any failure yields the items collected
    so far (usually []).
    """
    results = []
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Origin': 'https://www.pib.gov.in',
            'Referer': url
        }
        async with aiohttp.ClientSession(headers=headers) as session:
            # NOTE(review): ssl=False disables certificate verification for
            # pib.gov.in; kept as in the original, confirm it is still needed.
            async with session.get(url, ssl=False, timeout=30) as response:
                if response.status != 200:
                    return []
                html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            form_data = {
                input_tag.get('name'): input_tag.get('value', '')
                for input_tag in soup.find_all('input')
                if input_tag.get('name')
            }
            form_data.update({
                '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$ddlYear',
                '__EVENTARGUMENT': '',
                'ctl00$ContentPlaceHolder1$ddlMinistry': ministry if ministry and ministry != '0' else '0',
                # Default to the current year rather than a hard-coded one.
                'ctl00$ContentPlaceHolder1$ddlYear': year if year else _current_year(),
                'ctl00$ContentPlaceHolder1$ddlMonth': month if month else '0',
                'ctl00$ContentPlaceHolder1$ddlday': day if day else '0',
                'ctl00$ContentPlaceHolder1$ddlSector': '0',
            })
            async with session.post(url, data=form_data, ssl=False, timeout=30) as post_response:
                if post_response.status != 200:
                    return []
                post_content = await post_response.text()
        post_soup = BeautifulSoup(post_content, "html.parser")
        content_area = post_soup.find('div', class_='content-area')
        if content_area:
            for li in content_area.find_all('li'):
                link_tag = li.find('a')
                date_span = li.find('span', class_='publishdatesmall')
                if link_tag and date_span:
                    href = link_tag.get('href', '')
                    if href:
                        href = _absolute_url(href, "https://www.pib.gov.in")
                    results.append({
                        'title': link_tag.text.strip(),
                        'url': href,
                        'date': date_span.text.replace('Posted on:', '').strip()
                    })
    except Exception as e:
        print(f"PIB scraping error: {e}")
    return results


async def _scrape_pib_with_fallback(primary_url, fallback_url, ministry, year, month, day):
    """Scrape *primary_url*; if it yields nothing, retry once with *fallback_url*."""
    data = await scrape_pib_asp_net(primary_url, ministry, year, month, day)
    if not data:
        data = await scrape_pib_asp_net(fallback_url, ministry, year, month, day)
    return data


async def scrape_pib(ministry=None, year=None, month=None, day=None):
    """Return {'Backgrounders': [...]} scraped from the PIB backgrounder listing."""
    data = await _scrape_pib_with_fallback(
        # '&reg=3' had been entity-decoded to the (R) sign, corrupting the query string.
        "https://www.pib.gov.in/ViewBackgrounder.aspx?MenuId=51&reg=3&lang=1",
        "https://www.pib.gov.in/ViewBackgrounder.aspx?MenuId=51",
        ministry, year, month, day,
    )
    return {'Backgrounders': data}


async def scrape_pib_facts(ministry=None, year=None, month=None, day=None):
    """Return {'Backgrounders': [...]} scraped from the PIB factsheet listing.

    The result key stays 'Backgrounders' because that is what callers receive
    today.
    """
    data = await _scrape_pib_with_fallback(
        "https://www.pib.gov.in/AllFactsheet.aspx?MenuId=12&reg=3&lang=1",
        # Fall back to the factsheet page itself, not the backgrounder page.
        "https://www.pib.gov.in/AllFactsheet.aspx?MenuId=12",
        ministry, year, month, day,
    )
    return {'Backgrounders': data}


TH_url = "https://learningcorner.epaper.thehindu.com/articles"


@app.route('/')
@cache.cached(timeout=300)
async def index():
    """Render the dashboard: playlist unseen counts plus every scraped feed.

    All scrapers run concurrently; one that fails is logged and replaced by
    its empty default so the page still renders.
    """
    playlist_data = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    async with aiohttp.ClientSession(headers=headers) as session:
        try:
            loaded_playlists = await load_playlists(session)
            for playlist_item in loaded_playlists:
                try:
                    videos = await fetch_videos_from_playlist(session, playlist_item['id'])
                    recent_videos = filter_videos_by_date(videos, days=5)
                    unseen_videos = filter_unseen_videos(recent_videos)
                    playlist_data.append({
                        'id': playlist_item['id'],
                        'title': playlist_item['title'],
                        'unseen_count': len(unseen_videos),
                        'channel': playlist_item['channel_title']
                    })
                except Exception as e:
                    print(f"Error processing playlist {playlist_item.get('id', 'N/A')}: {e}")
            playlist_data.sort(key=lambda x: x['unseen_count'], reverse=True)
        except Exception as e:
            print(f"Error loading or processing playlists: {e}")

    # (key, log label, scraper, default-on-failure). Keeping the label beside
    # the scraper prevents the index/label mismatches of positional lookups.
    scrapers = [
        ('spotlight', 'AIR Spotlight', scrape_air_spotlight, []),
        ('insight', 'AIR Insight', scrape_air_insight, []),
        ('economy', 'AIR Economy', scrape_air_economy, []),
        ('current_air', 'AIR Current Affairs', scrape_current_affairs_air, []),
        ('indian_express', 'Indian Express', scrape_indian_express_articles, []),
        ('orf', 'ORF articles', scrape_orf_articles, []),
        ('sansad', 'Sansad TV Summaries (IASGyan)', scrape_AIR_sansad_tv_summaries_Iasgyan, []),
        ('pib', 'PIB Backgrounders', scrape_pib, {}),
        ('pib_facts', 'PIB Facts', scrape_pib_facts, {}),
        # NOTE(review): this result is not passed to the template (only the
        # combined ForumIAS feed is); the call is kept to preserve behaviour.
        ('forum_ca_single', 'ForumIAS CA', scrape_forumias, []),
        ('insights_articles', 'Insights articles', scrape_insights_articles, []),
        # Previously awaited on its own with no error handling.
        ('forum_ca', 'ForumIAS combined', scrape_forumias_combined, []),
    ]
    raw_results = await asyncio.gather(
        *(scraper() for _, _, scraper, _ in scrapers),
        return_exceptions=True,
    )
    scraped = {}
    for (key, label, _, default), result in zip(scrapers, raw_results):
        if isinstance(result, BaseException):
            print(f"Error scraping {label}: {result}")
            scraped[key] = default
        else:
            scraped[key] = result

    spotlight_unheard = filter_unheard_episodes(filter_recent_episodes(scraped['spotlight'] or [], days=5))
    insight_unheard = filter_unheard_episodes(filter_recent_episodes(scraped['insight'] or [], days=5))
    economy_unheard = filter_unheard_episodes(filter_recent_episodes(scraped['economy'] or [], days=5))
    current_unheard_air = filter_unheard_episodes(filter_recent_episodes(scraped['current_air'] or [], days=10))

    return render_template(
        'index.html',
        playlists=playlist_data,
        spotlight_unheard_count=len(spotlight_unheard),
        insight_unheard_count=len(insight_unheard),
        economy_unheard_count=len(economy_unheard),
        current_unheard_count=len(current_unheard_air),
        sansad_tv_summaries=scraped['sansad'] or [],
        pib_backgrounders=scraped['pib'] or {},
        subreddits=SUBREDDITS,
        articles=scraped['indian_express'] or [],
        orfarticles=scraped['orf'] or [],
        pibfacts=scraped['pib_facts'] or {},
        insightarticles=scraped['insights_articles'] or [],
        forum_ca=scraped['forum_ca']
    )


# The '<playlist_id>' converter is required: the view takes playlist_id and
# mark_watched() builds this URL with url_for(..., playlist_id=...).
@app.route('/unseen_videos/<playlist_id>')
async def unseen_videos(playlist_id):
    """Render the recent, not-yet-watched videos of one playlist."""
    unseen_vids = []
    try:
        async with aiohttp.ClientSession() as session:
            videos = await fetch_videos_from_playlist(session, playlist_id)
        recent_videos = filter_videos_by_date(videos, days=5)
        unseen_vids = filter_unseen_videos(recent_videos)
    except Exception as e:
        print(f"Error in /unseen_videos/{playlist_id}: {e}")
    return render_template('unseen_videos.html', videos=unseen_vids, playlist_id=playlist_id)


@app.route('/pib')
async def pibscrap():
    """Render PIB backgrounders filtered by the ministry/year/month/day query args."""
    ministry = request.args.get('ministry', '0')
    year = request.args.get('year', _current_year())
    month = request.args.get('month', '0')
    day = request.args.get('day', '0')
    pib_data = await scrape_pib(ministry=ministry, year=year, month=month, day=day)
    return render_template('pib.html', pib_backgrounders=pib_data or {})


@app.route('/pib_facts')
async def pibscrapfacts():
    """Render PIB factsheets filtered by the ministry/year/month/day query args."""
    ministry = request.args.get('ministry', '0')
    year = request.args.get('year', _current_year())
    month = request.args.get('month', '0')
    day = request.args.get('day', '0')
    pib_data = await scrape_pib_facts(ministry=ministry, year=year, month=month, day=day)
    return render_template('pib_facts.html', pib_backgrounders=pib_data or {})


@app.route('/mark_watched', methods=['POST'])
def mark_watched():
    """Record the posted video IDs as watched, then return to the originating page."""
    video_ids = request.form.getlist('video_ids')
    save_watched_videos(video_ids)
    playlist_id = request.form.get('playlist_id')
    if playlist_id:
        return redirect(url_for('unseen_videos', playlist_id=playlist_id))
    return redirect(url_for('index'))


@app.route('/add_playlist', methods=['GET', 'POST'])
def add_playlist_route():
    """Show the add-playlist form (GET) or store the submitted playlist ID (POST)."""
    if request.method == 'POST':
        playlist_id = request.form.get('playlist_id', '').strip()
        if playlist_id:
            add_playlist(playlist_id)
        return redirect(url_for('index'))
    return render_template('add_playlist.html')


async def _render_air_episodes_page(scrape_function, days_filter, template_name='spotlight.html'):
    """Scrape with *scrape_function*, keep recent unheard episodes, and render them."""
    episodes_data = []
    try:
        raw_episodes = await scrape_function()
        recent_episodes_data = filter_recent_episodes(raw_episodes or [], days=days_filter)
        episodes_data = filter_unheard_episodes(recent_episodes_data or [])
    except Exception as e:
        print(f"Error in AIR episodes route for {scrape_function.__name__}: {e}")
    return render_template(template_name, episodes=episodes_data)


@app.route('/spotlight')
async def spotlight():
    """Render unheard AIR Spotlight episodes from the last 5 days."""
    return await _render_air_episodes_page(scrape_air_spotlight, 5)


@app.route('/Insight')
async def insight():
    """Render unheard AIR Insight episodes from the last 5 days."""
    return await _render_air_episodes_page(scrape_air_insight, 5)


@app.route('/aireconomy')
async def aireconomy():
    """Render unheard AIR Money Talk episodes from the last 5 days."""
    return await _render_air_episodes_page(scrape_air_economy, 5)


@app.route('/aircurrentaffairs')
async def airCA():
    """Render unheard AIR Current Affairs episodes from the last 10 days."""
    return await _render_air_episodes_page(scrape_current_affairs_air, 10)


@app.route('/mark_listened', methods=['POST'])
def mark_listened():
    """Record the posted episode links as listened, then go back to the referrer."""
    episode_links = request.form.getlist('episode_links')
    save_listened_episodes(episode_links)
    return redirect(request.referrer or url_for('index'))


BASE_URL_MEA = "https://www.mea.gov.in/bilateral-documents.htm"


async def fetch_page_mea(session, url):
    """Return the HTML of an MEA page, or None on a non-200 status or error."""
    try:
        async with session.get(url, timeout=20) as response:
            if response.status != 200:
                print(f"Failed to fetch MEA page {url}: {response.status}")
                return None
            return await response.text()
    except Exception as e:
        print(f"Error fetching MEA page {url}: {e}")
        return None


async def parse_page_mea(content, days_ago_cutoff_date):
    """Parse one MEA listing page.

    Returns ``(documents, continue_scraping)``. ``continue_scraping`` is False
    when the page is empty, the listing is missing, or an item older than
    *days_ago_cutoff_date* (timezone-aware) is reached.
    """
    documents = []
    if not content:
        return documents, False
    soup = BeautifulSoup(content, "html.parser")
    item_list = soup.find('ul', class_='commonListing')
    if not item_list:
        print("MEA commonListing not found.")
        return documents, False
    for item in item_list.find_all('li'):
        title_link = item.find('a', class_='searchContent')
        date_container = item.find('span', class_='date')
        if not (title_link and date_container):
            continue
        href = title_link.get('href')
        if not href:
            # Skip a link without a target instead of aborting the whole scrape.
            continue
        title = title_link.text.strip()
        doc_url = _absolute_url(href, "https://www.mea.gov.in")
        date_str_raw = date_container.text.strip()
        try:
            date_obj = datetime.strptime(date_str_raw, "%B %d, %Y").replace(tzinfo=timezone.utc)
        except ValueError:
            print(f"Error parsing MEA date: {date_str_raw} for title '{title}'")
            continue
        if date_obj < days_ago_cutoff_date:
            # Stop paging at the first item past the cutoff.
            return documents, False
        documents.append({
            'title': title,
            'url': doc_url,
            'date': date_obj.strftime("%B %d, %Y")
        })
    return documents, True


def get_next_page_url_mea(content):
    """Return the absolute URL of the 'next' pagination link, or None if absent."""
    if not content:
        return None
    soup = BeautifulSoup(content, "html.parser")
    next_link = soup.find('a', class_='next')
    if next_link and 'href' in next_link.attrs:
        href = next_link['href']
        if href.startswith('http'):
            return href
        if href.startswith('/'):
            return f"https://www.mea.gov.in{href}"
        return f"https://www.mea.gov.in/bilateral-documents/{href}"
    return None


async def scrape_bilateral_documents():
    """Collect MEA bilateral documents from the last 90 days (at most 10 pages)."""
    all_documents = []
    try:
        async with aiohttp.ClientSession() as session:
            current_url = f"{BASE_URL_MEA}?53/Bilateral/Multilateral_Documents"
            days_ago_cutoff_date = (datetime.now(timezone.utc) - timedelta(days=90)).replace(
                hour=0, minute=0, second=0, microsecond=0)
            page_count = 0
            max_pages = 10
            while current_url and page_count < max_pages:
                page_count += 1
                print(f"Scraping MEA page {page_count}: {current_url}")
                content = await fetch_page_mea(session, current_url)
                if not content:
                    break
                documents_on_page, continue_scraping = await parse_page_mea(content, days_ago_cutoff_date)
                all_documents.extend(documents_on_page)
                if not continue_scraping:
                    print("Stopping MEA scraping based on date or parsing issue.")
                    break
                current_url = get_next_page_url_mea(content)
                if not current_url:
                    print("No next page found for MEA.")
                    break
                await asyncio.sleep(1)  # pause between page requests
    except Exception as e:
        print(f"Error during MEA bilateral documents scraping: {e}")
    return all_documents


@app.route('/MEAsite')
@cache.cached(timeout=3600)
async def bilateral_documents():
    """Render the recent MEA bilateral documents (cached for an hour)."""
    documents_data = await scrape_bilateral_documents()
    return render_template('bilateral_documents.html', documents=documents_data or [])


async def scrape_prs_india():
    """Return the cards ({'title', 'image_url', 'link_url'}) in the PRS India home-page banner."""
    cards_data = []
    try:
        url = "https://prsindia.org"
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=20) as response:
                if response.status != 200:
                    print(f"Failed to fetch PRS India data: {response.status}")
                    return []
                content = await response.text()
        soup = BeautifulSoup(content, "html.parser")
        right_banner = soup.find('div', class_='right-banner')
        if not right_banner:
            print("PRS India: right-banner not found.")
            return cards_data
        card_class = re.compile(r"col-\w*-6|card-item-class")
        for item in right_banner.find_all(['div', 'section'], class_=card_class):
            image_tag = item.find('img')
            link_tag = item.find('a')
            title_tag = item.find(['h3', 'h4', 'h5'])
            if not (link_tag and title_tag):
                continue
            link_href = link_tag.get('href')
            if not link_href:
                # A card without a link target is skipped, not fatal.
                continue
            img_src = None
            if image_tag and 'src' in image_tag.attrs:
                img_src = _absolute_url(image_tag['src'], url)
            cards_data.append({
                'title': title_tag.text.strip(),
                'image_url': img_src,
                'link_url': _absolute_url(link_href, url)
            })
    except Exception as e:
        print(f"Error scraping PRS India: {e}")
    return cards_data


@app.route('/prsindia')
@cache.cached(timeout=3600)
async def prs_india():
    """Render the PRS India banner cards (cached for an hour)."""
    scraped_cards = await scrape_prs_india()
    return render_template('prsindia.html', cards=scraped_cards or [])


async def scrape_prs_bills(session, search_keyword=None, year=None, status=None):
    """Return bills ({'title', 'url', 'status'}) from the PRS bill tracker.

    *search_keyword*, *status* and *year* are sent as search query parameters
    only when truthy.
    """
    bills_data = []
    try:
        base_url = "https://prsindia.org/billtrack/category/billtrack"
        params = {}
        if search_keyword:
            params['BillActsBillsParliamentSearch[title]'] = search_keyword
        if status:
            params['BillActsBillsParliamentSearch[bill_status_id]'] = status
        if year:
            params['BillActsBillsParliamentSearch[date_of_introduction]'] = year
        async with session.get(base_url, params=params, timeout=20) as response:
            if response.status != 200:
                print(f"Failed to fetch PRS India bills data: {response.status}")
                return []
            content = await response.text()
        soup = BeautifulSoup(content, "html.parser")
        for row in soup.find_all('div', class_='views-row'):
            title_div = row.find('div', class_='views-field-title-field')
            status_div = row.find('div', class_='views-field-field-bill-status')
            if not (title_div and status_div):
                continue
            title_tag = title_div.find('h3', class_='cate')
            if not (title_tag and title_tag.a):
                continue
            bill_href = title_tag.a.get('href')
            if not bill_href:
                continue
            status_span = status_div.find('span')
            bills_data.append({
                'title': title_tag.a.text.strip(),
                'url': _absolute_url(bill_href, "https://prsindia.org"),
                'status': status_span.text.strip() if status_span else "Unknown"
            })
    except Exception as e:
        print(f"Error scraping PRS India bills: {e}")
    return bills_data
@app.route('/prsindia_bills') async def prs_india_bills(): search_keyword = request.args.get('search', '') year = request.args.get('year', str(datetime.now().year)) status = request.args.get('status', '') async with aiohttp.ClientSession() as session: bills = await scrape_prs_bills(session, search_keyword, year, status) return render_template('prsindia_bills.html', bills=bills, search_keyword=search_keyword, year=year, status=status) async def scrape_current_affairs_iasgyan(): current_affairs_data = [] try: url = "https://www.iasgyan.in/daily-current-affairs" async with aiohttp.ClientSession() as session: async with session.get(url, timeout=20) as response: if response.status != 200: print(f"Failed to fetch IASGyan Current Affairs data: {response.status}") return [] content = await response.text() soup = BeautifulSoup(content, "html.parser") cutoff_date_iasgyan = datetime.now() - timedelta(days=6) for article_block in soup.find_all('div', class_='shadow mt-4 rounded-2'): title_tag_ias = article_block.find('h3', class_='fw-semibold text-white m-0 fs-5') article_links_list = article_block.find_all('a', class_='w-100') if title_tag_ias and article_links_list: date_str_match = re.search(r'–\s*(.+)$', title_tag_ias.text.strip()) if not date_str_match: continue date_str_cleaned = re.sub(r'(\d+)(st|nd|rd|th|TH|ND|RD|ST)\s*', r'\1 ', date_str_match.group(1).strip()) date_str_cleaned = date_str_cleaned.replace(" ", " ") try: article_date_obj = datetime.strptime(date_str_cleaned.strip(), '%d %B %Y') except ValueError as ve: print(f"IASGyan: Error parsing date '{date_str_cleaned}': {ve}") continue if article_date_obj >= cutoff_date_iasgyan: articles_for_date = [] for link_item in article_links_list: articles_for_date.append({ 'title': link_item.text.strip(), 'url': link_item['href'] if link_item.get('href') else '#' }) if articles_for_date: current_affairs_data.append({ 'date': article_date_obj, 'articles': articles_for_date }) current_affairs_data.sort(key=lambda x: 
x['date'], reverse=True) except Exception as e: print(f"Error scraping IASGyan Current Affairs: {e}") return current_affairs_data async def scrape_AIR_sansad_tv_summaries_Iasgyan(): summaries_data = [] try: url = "https://www.iasgyan.in/sansad-tv-air-summaries" async with aiohttp.ClientSession() as session: async with session.get(url, timeout=20) as response: if response.status != 200: print(f"Failed to fetch IASGyan Sansad TV & AIR summaries: {response.status}") return [] content = await response.text() soup = BeautifulSoup(content, "html.parser") for summary_block_item in soup.find_all('div', class_='content_bx'): title_tag_sum = summary_block_item.find('div', class_='title').find('a') if summary_block_item.find('div', class_='title') else None date_tag_sum = summary_block_item.find('li', class_='text-muted') description_tag_sum = summary_block_item.find('div', class_='short_descr').find('ol') if summary_block_item.find('div', class_='short_descr') else None read_more_tag_sum = summary_block_item.find('div', class_='readmore_btn').find('a') if summary_block_item.find('div', class_='readmore_btn') else None if not (title_tag_sum and date_tag_sum and description_tag_sum and read_more_tag_sum): continue title_text = title_tag_sum.text.strip() doc_url_sum = title_tag_sum['href'] summary_date_str_raw = " ".join(date_tag_sum.text.strip().split()).replace(',', '') try: if len(summary_date_str_raw.split()[1]) == 3: parsed_date_sum = datetime.strptime(summary_date_str_raw, '%d %b %Y') else: parsed_date_sum = datetime.strptime(summary_date_str_raw, '%d %B %Y') except ValueError: print(f"IASGyan Summaries: Could not parse date '{summary_date_str_raw}' for '{title_text}'") parsed_date_sum = datetime.min summary_points_list = [li.text.strip() for li in description_tag_sum.find_all('li')] summaries_data.append({ 'title': title_text, 'url': doc_url_sum, 'date_obj': parsed_date_sum, 'date': parsed_date_sum.strftime('%d %b %Y') if parsed_date_sum != datetime.min else 
summary_date_str_raw, 'points': summary_points_list, 'read_more_url': read_more_tag_sum['href'] }) summaries_data.sort(key=lambda x: x['date_obj'], reverse=True) return summaries_data[:3] except Exception as e: print(f"Error scraping IASGyan Sansad TV summaries: {e}") return summaries_data async def scrape_indian_express_articles(): articles_data = [] try: url = "https://indianexpress.com/section/upsc-current-affairs/upsc-essentials/" headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' } async with aiohttp.ClientSession(headers=headers) as session: async with session.get(url, timeout=20) as response: if response.status != 200: print(f"Failed to fetch Indian Express UPSC articles: {response.status}") return [] content = await response.text() soup = BeautifulSoup(content, "html.parser") one_week_ago_cutoff = datetime.now(timezone.utc) - timedelta(days=7) for article_div in soup.find_all('div', class_='articles'): try: context_div = article_div.find('div', class_='img-context') if not context_div: continue title_tag = context_div.find('h2', class_='title').find('a') if not title_tag: continue title_text = title_tag.text.strip() doc_url = title_tag['href'] date_div = context_div.find('div', class_='date') date_str_raw = date_div.text.strip() if date_div else "" summary_tag = context_div.find('p') summary_text = summary_tag.text.strip() if summary_tag else "" snaps_div = article_div.find('div', class_='snaps') image_url = None if snaps_div: img = snaps_div.find('img') if img: image_url = img.get('src') or img.get('data-src') article_date_obj = None clean_date_str = date_str_raw.replace('IST', '').strip() clean_date_str = re.sub(r'\s+', ' ', clean_date_str) try: article_date_obj = datetime.strptime(clean_date_str, '%B %d, %Y %H:%M') except ValueError: try: article_date_obj = datetime.strptime(clean_date_str, '%B %d, %Y') except ValueError: continue article_date_obj = 
article_date_obj.replace(tzinfo=timezone.utc) if article_date_obj >= one_week_ago_cutoff: articles_data.append({ 'title': title_text, 'url': doc_url, 'image_url': image_url, 'date': date_str_raw, 'summary': summary_text, 'date_obj': article_date_obj }) except Exception as inner_e: print(f"Error parsing specific IE article: {inner_e}") continue articles_data.sort(key=lambda x: x['date_obj'], reverse=True) except Exception as e: print(f"Error scraping Indian Express articles: {e}") return articles_data async def scrape_full_article(url): try: async with aiohttp.ClientSession() as session: async with session.get(url, timeout=20) as response: if response.status != 200: print(f"Failed to fetch article content from {url}: {response.status}") return None content = await response.text() soup = BeautifulSoup(content, "html.parser") content_div = soup.find('div', id='pcl-full-content') if not content_div: content_div = soup.find('article') or soup.find('main') or soup.find('div', class_=re.compile(r'content|article-body|story')) if not content_div: print(f"Could not find main content container for {url}") return f"

Content not found. Please visit original article.

" title_tag_full = soup.find(['h1', 'h2'], class_=re.compile(r'title|headline')) if not title_tag_full: title_tag_full = soup.find('h1') title_text_full = title_tag_full.get_text(strip=True) if title_tag_full else "Article" author_date_div = soup.find(['div','span'], class_=re.compile(r'editor|author|date|byline|meta')) author_date_text_full = author_date_div.get_text(separator=" ", strip=True) if author_date_div else "" elements = content_div.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'blockquote', 'figure', 'table']) formatted_content_parts = [] for elem in elements: if elem.name in ['h1','h2','h3','h4','h5','h6']: formatted_content_parts.append(f"<{elem.name}>{elem.get_text(strip=True)}") elif elem.name == 'p': if elem.find_parent('li'): continue formatted_content_parts.append(f"

{elem.get_text(separator=' ', strip=True)}

") elif elem.name in ['ul', 'ol']: list_items_html = "".join([f"
  • {li_item.get_text(separator=' ', strip=True)}
  • " for li_item in elem.find_all('li', recursive=False)]) formatted_content_parts.append(f"<{elem.name}>{list_items_html}") elif elem.name == 'blockquote': formatted_content_parts.append(f"
    {elem.get_text(separator=' ', strip=True)}
    ") elif elem.name == 'figure': img = elem.find('img') caption = elem.find('figcaption') if img and 'src' in img.attrs: img_html = f"{img.get(" if caption: img_html += f"
    {caption.get_text(strip=True)}
    " formatted_content_parts.append(f"
    {img_html}
    ") elif elem.name == 'table': table_html = "" for tr in elem.find_all('tr'): table_html += "" for th_td in tr.find_all(['th', 'td']): table_html += f"<{th_td.name}>{th_td.get_text(strip=True)}" table_html += "" table_html += "
    " formatted_content_parts.append(table_html) full_article_html = f"

    {title_text_full}

    " if author_date_text_full: full_article_html += f"
    {author_date_text_full}
    " full_article_html += "\n".join(formatted_content_parts) return full_article_html except Exception as e: print(f"Error scraping full article from {url}: {e}") return f"

    Error loading article content. Please visit original article.

    " @app.route('/article/') async def show_article(url): if not url.startswith('http'): print(f"Warning: URL '{url}' might be partial. Assuming it's complete.") full_content_html = await scrape_full_article(url) if full_content_html is None: full_content_html = f"

    Failed to fetch article content for {url}.

    " return render_template('full_article.html', content=full_content_html) async def scrape_insights_articles(): articles_data_insights = [] try: url = 'https://www.insightsonindia.com/upsc-mains-answer-writing-2025-insights-ias/' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' } async with aiohttp.ClientSession(headers=headers) as session: async with session.get(url, timeout=20) as response: if response.status != 200: print(f"Failed to fetch Insights Answer Writing links: {response.status}") return [] content = await response.text() soup = BeautifulSoup(content, 'html.parser') count = 0 for div_block in soup.find_all('div', class_='list_div'): if count >= 2: break ul_tag = div_block.find('ul', class_='lcp_catlist') if ul_tag: for li in ul_tag.find_all('li'): a_tag = li.find('a') if a_tag and a_tag.get('href'): title = a_tag.text.strip() link = a_tag['href'] articles_data_insights.append({ 'title': title, 'link': link }) count += 1 except Exception as e: print(f"Error scraping Insights Answer Writing links: {e}") return articles_data_insights async def scrape_full_article_insight(article_url): filtered_content_parts = [] try: async with aiohttp.ClientSession() as session: async with session.get(article_url, timeout=20) as response: if response.status != 200: print(f"Failed to fetch Insights article content from {article_url}: {response.status}") return None content = await response.text() soup = BeautifulSoup(content, 'html.parser') article_body = soup.find('div', class_=re.compile(r'entry-content|article-content|post-content')) if not article_body: print(f"Insights: Could not find article body for {article_url}") return [{'type': 'p', 'text': 'Article content not found.'}] current_section_text = None for tag_item in article_body.find_all(['h1','h2','h3', 'h4', 'p', 'ul', 'ol', 'blockquote', 'table']): if tag_item.name in ['h1','h2','h3','h4']: current_section_text = 
tag_item.text.strip() filtered_content_parts.append({'type': tag_item.name, 'text': current_section_text}) elif tag_item.name == 'p': filtered_content_parts.append({'type': 'p', 'text': tag_item.text.strip()}) elif tag_item.name in ['ul', 'ol']: items = [li.get_text(strip=True) for li in tag_item.find_all('li')] if items: filtered_content_parts.append({'type': 'list', 'ordered': tag_item.name == 'ol', 'items': items}) elif tag_item.name == 'blockquote': filtered_content_parts.append({'type': 'blockquote', 'text': tag_item.get_text(strip=True)}) elif tag_item.name == 'table': rows = [] for tr in tag_item.find_all('tr'): cells = [td.get_text(strip=True) for td in tr.find_all(['th', 'td'])] rows.append(cells) if rows: filtered_content_parts.append({'type': 'table', 'rows': rows}) except Exception as e: print(f"Error scraping full Insights article from {article_url}: {e}") return [{'type': 'p', 'text': f'Error loading article: {e}'}] return filtered_content_parts @app.route('/article_insight/') async def show_article_insight(url): full_content_data = await scrape_full_article_insight(url) if not full_content_data: return "Failed to load the Insights article content.", 404 return render_template('full_article_insight.html', content=full_content_data) async def scrape_orf_articles(): orf_articles_data = [] try: url = 'https://www.orfonline.org/content-type/issue-briefs' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' } async with aiohttp.ClientSession(headers=headers) as session: async with session.get(url, timeout=20) as response: if response.status != 200: print(f"Failed to fetch ORF articles from {url}: {response.status}") return [] content = await response.text() soup = BeautifulSoup(content, 'html.parser') cutoff_orf = datetime.now(timezone.utc) - timedelta(days=45) potential_articles = soup.find_all('div', class_=re.compile(r'col-|card|item|listing|post')) for article_block 
in potential_articles: title_tag = article_block.find(['h2', 'h3']) if not title_tag: continue title_text = title_tag.get_text(strip=True) if not title_text or len(title_text) < 10: continue link_tag = title_tag.find('a') if not link_tag: link_tag = article_block.find('a') if not link_tag or not link_tag.get('href'): continue doc_url = link_tag['href'] if not doc_url.startswith('http'): doc_url = f"https://www.orfonline.org{doc_url}" if doc_url.startswith('/') else f"https://www.orfonline.org/{doc_url}" date_tag = article_block.find('time') or article_block.find(class_=re.compile(r'date|meta|time')) article_date_obj = None if date_tag: date_str = date_tag.get_text(strip=True) try: article_date_obj = datetime.strptime(date_str, "%b %d, %Y").replace(tzinfo=timezone.utc) except ValueError: try: article_date_obj = datetime.strptime(date_str, "%d %B %Y").replace(tzinfo=timezone.utc) except ValueError: pass if not article_date_obj: article_date_obj = datetime.now(timezone.utc) if article_date_obj >= cutoff_orf: desc_tag = article_block.find('p') desc_text = desc_tag.get_text(strip=True) if desc_tag else "" if any(a['link'] == doc_url for a in orf_articles_data): continue orf_articles_data.append({ 'title': title_text, 'link': doc_url, 'date_obj': article_date_obj, 'date': article_date_obj.strftime('%B %d, %Y'), 'description': desc_text, 'author': "ORF" }) except Exception as e: print(f"Error scraping ORF articles: {e}") seen = set() unique_data = [] for d in orf_articles_data: if d['link'] not in seen: seen.add(d['link']) unique_data.append(d) unique_data.sort(key=lambda x: x['date_obj'], reverse=True) return unique_data async def scrape_forumias(url_path="7pm"): url = f"https://forumias.com/blog/{url_path}/" headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } async with aiohttp.ClientSession(headers=headers) as session: async with session.get(url, timeout=15) as response: if 
response.status != 200: return [] html = await response.text() soup = BeautifulSoup(html, "html.parser") sections_data = [] articles_list = [] date_groups = soup.find_all('div', class_='cat-archive-date-group') for group in date_groups: date_div = group.find('div', class_='post-date') date_text = date_div.get_text(" ", strip=True) if date_div else "" links = group.find_all('a') for a in links: articles_list.append({ 'title': a.get_text(strip=True), 'url': a.get('href'), 'date': date_text }) if articles_list: sections_data.append({ 'section': f"ForumIAS {url_path.upper()} Editorials", 'articles': articles_list }) return sections_data async def scrape_forumias_combined(): results = await asyncio.gather( scrape_forumias("7pm"), scrape_forumias("9pm") ) return [item for sublist in results for item in sublist] @app.route('/forumias') async def forumias(): scraped_sections = await scrape_forumias_combined() return render_template('forumias.html', sections=scraped_sections) @app.route('/forumias/
    ') async def forumias_section(section): if section not in ['7pm', '9pm']: return "Invalid section", 404 scraped_sections = await scrape_forumias(section) return render_template('forumias.html', sections=scraped_sections) @app.route('/TH_article/') async def show_th_article(url): article_content_data = await scrape_TH_learning(url) if not article_content_data: return "Failed to load The Hindu Learning Corner article content.", 404 return render_template('article_content.html', content=article_content_data) async def scrape_TH_learning(article_url_th): article_content_th = [] try: async with aiohttp.ClientSession() as session: async with session.get(article_url_th, timeout=20) as response: if response.status != 200: print(f"Failed to fetch TH Learning article from {article_url_th}: {response.status}") return None content = await response.text() soup = BeautifulSoup(content, "html.parser") main_content_area = soup.find('div', class_=re.compile(r'articlebody|content|story-body')) if not main_content_area: main_content_area = soup for tag_item_th in main_content_area.find_all(['h1','h2','h3', 'h4', 'p', 'ul', 'ol']): if tag_item_th.name in ['h1','h2','h3','h4']: article_content_th.append({'type': tag_item_th.name, 'text': tag_item_th.text.strip()}) elif tag_item_th.name == 'p': article_content_th.append({'type': 'p', 'text': tag_item_th.text.strip()}) elif tag_item_th.name in ['ul', 'ol']: items = [li.get_text(strip=True) for li in tag_item_th.find_all('li')] if items: article_content_th.append({'type':'list', 'ordered': tag_item_th.name=='ol', 'items':items}) if not article_content_th and main_content_area == soup: print(f"TH Learning: No specific content tags found on {article_url_th}, page might be structured differently.") except Exception as e: print(f"Error scraping TH Learning article from {article_url_th}: {e}") return None return article_content_th if __name__ == '__main__': app.run(debug=True)