from flask import Flask, render_template, request, redirect, url_for, jsonify
from flask_caching import Cache
import aiohttp
import asyncio
from bs4 import BeautifulSoup, NavigableString
#from playwright.async_api import async_playwright
from datetime import datetime, timedelta, timezone
import os
import calendar
import praw
import re
#from dotenv import load_dotenv

#load_dotenv()
API_KEY = os.environ.get('YT_API_KEY')
REDDIT_CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID')
REDDIT_CLIENT_SECRET = os.environ.get('REDDIT_API_KEY')
REDDIT_USER_AGENT = "user age by u/Hemanth"

app = Flask(__name__)
cache = Cache(app, config={'CACHE_TYPE': 'simple'})

YOUTUBE_API_URL = 'https://www.googleapis.com/youtube/v3/playlistItems'
PLAYLIST_API_URL = 'https://www.googleapis.com/youtube/v3/playlists'
WATCHED_VIDEOS_FILE = 'watched_videos.txt'
PLAYLISTS_FILE = 'playlists.txt'
LISTENED_EPISODES_FILE = 'listened_episodes.txt'


async def fetch_videos_from_playlist(session, playlist_id):
    params = {
        'part': 'snippet',
        'playlistId': playlist_id,
        'maxResults': 50,
        'key': API_KEY
    }
    try:
        async with session.get(YOUTUBE_API_URL, params=params) as response:
            if response.status != 200:
                print(f"Failed to fetch data for playlist {playlist_id}: {response.status} {await response.text()}")
                return []
            data = await response.json()
            return data.get('items', [])
    except Exception as e:
        print(f"Error in fetch_videos_from_playlist for {playlist_id}: {e}")
        return []


async def fetch_playlist_title(session, playlist_id):
    params = {
        'part': 'snippet',
        'id': playlist_id,
        'key': API_KEY,
    }
    try:
        async with session.get(PLAYLIST_API_URL, params=params) as response:
            if response.status == 200:
                data = await response.json()
                if data.get('items'):
                    return {
                        'id': data['items'][0]['id'],
                        'title': data['items'][0]['snippet']['title'],
                        'channel_title': data['items'][0]['snippet']['channelTitle']
                    }
            else:
                print(f"Failed to fetch playlist title for {playlist_id}: {response.status} {await response.text()}")
    except Exception as e:
        print(f"Error in fetch_playlist_title for {playlist_id}: {e}")
    return None
 

reddit = praw.Reddit(
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_CLIENT_SECRET,
    user_agent=REDDIT_USER_AGENT
)


SUBREDDITS = ['UPSC','SideProject', 'datascience','explainlikeimfive','Krishnamurti','ycombinator','OpenAI','programming','AskReddit', 'worldnews', 'politics']

@app.route('/get_posts', methods=['POST'])
def get_posts():
    data = request.json
    selected_subreddits = data.get('subreddits', [])
    sort_method = data.get('sort', 'hot') 
    limit = int(data.get('limit', 10))

    all_posts = []
    for subreddit_name in selected_subreddits:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            if sort_method == 'top':
                posts_iterable = subreddit.top(limit=limit)
            elif sort_method == 'new':
                posts_iterable = subreddit.new(limit=limit)
            else:
                posts_iterable = subreddit.hot(limit=limit)

            for post in posts_iterable:
                all_posts.append({
                    'subreddit': subreddit_name,
                    'title': post.title,
                    'url': post.url,
                    'author': post.author.name if post.author else '[deleted]',
                    'score': post.score,
                    'num_comments': post.num_comments,
                    'created_utc': post.created_utc,
                    'selftext': post.selftext[:300] + '...' if len(post.selftext) > 300 else post.selftext,
                    'link': f"https://reddit.com{post.permalink}",
                    'flair': post.link_flair_text if post.link_flair_text else 'No flair',
                    'nsfw': post.over_18,
                    'spoiler': post.spoiler
                })
        except Exception as e:
            print(f"Error fetching posts from subreddit {subreddit_name}: {e}")

    return jsonify(all_posts)

def filter_videos_by_date(videos, days=5):
    recent_videos = []
    if not videos: return [] 
    cutoff_date = datetime.now(timezone.utc) - timedelta(days=days) 
    for video in videos:
        try:
            publish_date = datetime.strptime(video['snippet']['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
            if publish_date >= cutoff_date:
                recent_videos.append({
                    'videoId': video['snippet']['resourceId']['videoId'],
                    'title': video['snippet']['title'],
                    'url': f"https://www.youtube.com/watch?v={video['snippet']['resourceId']['videoId']}",
                    'publishedAt': publish_date
                })
        except (KeyError, TypeError, ValueError) as e:
            print(f"Error processing video data: {video.get('snippet', {}).get('title', 'Unknown Video')}. Error: {e}")
            continue
    return recent_videos

def load_watched_videos():
    if os.path.exists(WATCHED_VIDEOS_FILE):
        with open(WATCHED_VIDEOS_FILE, 'r') as file:
            return {line.strip() for line in file}
    return set()

def save_watched_videos(video_ids):
    try:
        with open(WATCHED_VIDEOS_FILE, 'a') as file:
            for video_id in video_ids:
                file.write(f"{video_id}\n")
    except IOError as e:
        print(f"Error saving watched videos: {e}")


def filter_unseen_videos(videos):
    if not videos: return []
    watched_videos = load_watched_videos()
    return [video for video in videos if video['videoId'] not in watched_videos]

async def load_playlists(session):
    playlists_data = []
    if os.path.exists(PLAYLISTS_FILE):
        try:
            with open(PLAYLISTS_FILE, 'r') as file:
                playlist_ids = [line.strip() for line in file]
            
            tasks = [fetch_playlist_title(session, playlist_id) for playlist_id in playlist_ids]
            playlist_details_results = await asyncio.gather(*tasks, return_exceptions=True)
            
            for detail in playlist_details_results:
                if isinstance(detail, Exception):
                    print(f"Error fetching a playlist detail: {detail}")
                elif detail:
                    playlists_data.append(detail)
        except Exception as e:
            print(f"Error loading playlists from file or fetching details: {e}")
    return playlists_data


def add_playlist(playlist_id):
    try:
        with open(PLAYLISTS_FILE, 'a') as file:
            file.write(f"{playlist_id}\n")
    except IOError as e:
        print(f"Error adding playlist to file: {e}")

async def scrape_air_content(url, title_filter):
    episodes = []
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status != 200:
                    print(f"Failed to fetch data from {url}: {response.status}")
                    return []
                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")
                
                table = soup.find('table', class_='table')
                if not table:
                    print(f"Table not found on {url}")
                    return []
                
                rows = table.find_all('tr')[1:] 
                for row in rows:
                    cols = row.find_all('td')
                    if len(cols) < 4:
                        continue
                    
                    title = cols[0].text.strip()
                    date_str = cols[1].text.strip()
                    time_str = cols[2].text.strip()
                    
                    audio_tag = cols[3].find('audio')
                    audio_src = None
                    if audio_tag:
                        source_tag = audio_tag.find('source')
                        if source_tag and 'src' in source_tag.attrs:
                            audio_src = source_tag['src']

                    if not audio_src:
                        continue

                    try:
                        date_obj = datetime.strptime(f"{date_str} {time_str}", '%d %b %Y %H:%M')
                    except ValueError:
                        print(f"Could not parse date for AIR episode: {date_str} {time_str}")
                        continue
                    
                    if title in title_filter:
                        episodes.append({
                            'title': title,
                            'date': date_obj,
                            'audio_link': audio_src
                        })
    except Exception as e:
        print(f"Error scraping AIR content from {url} for titles {title_filter}: {e}")
    return episodes

async def scrape_air_spotlight():
    return await scrape_air_content("https://www.newsonair.gov.in/listen-broadcast-category/daily-broadcast/", ["Spotlight"])

async def scrape_air_insight():
    return await scrape_air_content("https://www.newsonair.gov.in/listen-broadcast-category/weekly-broadcast/", ["Insight", "Insights"])

async def scrape_air_economy():
    return await scrape_air_content("https://www.newsonair.gov.in/listen-broadcast-category/weekly-broadcast/", ["Money Talk"])

async def scrape_current_affairs_air():
    return await scrape_air_content("https://www.newsonair.gov.in/listen-broadcast-category/weekly-broadcast/", ["Current Affairs"])

def filter_recent_episodes(episodes, days=3):
    if not episodes: return []
    cutoff_date = datetime.now() - timedelta(days=days)
    return [episode for episode in episodes if episode['date'] >= cutoff_date]


def load_listened_episodes():
    if os.path.exists(LISTENED_EPISODES_FILE):
        with open(LISTENED_EPISODES_FILE, 'r') as file:
            return {line.strip() for line in file}
    return set()

def save_listened_episodes(episode_links):
    try:
        with open(LISTENED_EPISODES_FILE, 'a') as file:
            for link in episode_links:
                file.write(f"{link}\n")
    except IOError as e:
        print(f"Error saving listened episodes: {e}")


def filter_unheard_episodes(episodes):
    if not episodes: return []
    listened_episodes = load_listened_episodes()
    return [episode for episode in episodes if episode.get('audio_link') not in listened_episodes]


async def scrape_pib_asp_net(url, ministry=None, year=None, month=None, day=None):
    results = []
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Origin': 'https://www.pib.gov.in',
            'Referer': url
        }

        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url, ssl=False, timeout=30) as response:
                if response.status != 200:
                    return []
                html = await response.text()
                soup = BeautifulSoup(html, "html.parser")

            form_data = {}
            for input_tag in soup.find_all('input'):
                if input_tag.get('name'):
                    form_data[input_tag.get('name')] = input_tag.get('value', '')

            ministry_val = ministry if ministry and ministry != '0' else '0'
            year_val = year if year else '2024'
            month_val = month if month else '0'
            day_val = day if day else '0'
            overrides = {
                '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$ddlYear',
                '__EVENTARGUMENT': '',
                'ctl00$ContentPlaceHolder1$ddlMinistry': ministry_val,
                'ctl00$ContentPlaceHolder1$ddlYear': year_val,
                'ctl00$ContentPlaceHolder1$ddlMonth': month_val,
                'ctl00$ContentPlaceHolder1$ddlday': day_val,
                'ctl00$ContentPlaceHolder1$ddlSector': '0', 
            }
            form_data.update(overrides)

            async with session.post(url, data=form_data, ssl=False, timeout=30) as post_response:
                if post_response.status != 200:
                    return []
                post_content = await post_response.text()
                post_soup = BeautifulSoup(post_content, "html.parser")

                content_area = post_soup.find('div', class_='content-area')
                if content_area:
                    for li in content_area.find_all('li'):
                        link_tag = li.find('a')
                        date_span = li.find('span', class_='publishdatesmall')
                        
                        if link_tag and date_span:
                            href = link_tag.get('href', '')
                            if href and not href.startswith('http'):
                                if href.startswith('/'):
                                    href = f"https://www.pib.gov.in{href}"
                                else:
                                    href = f"https://www.pib.gov.in/{href}"

                            results.append({
                                'title': link_tag.text.strip(),
                                'url': href,
                                'date': date_span.text.replace('Posted on:', '').strip()
                            })

    except Exception as e:
        print(f"PIB scraping error: {e}")
    
    return results

async def scrape_pib(ministry=None, year=None, month=None, day=None):
    url = "https://www.pib.gov.in/ViewBackgrounder.aspx?MenuId=51&reg=3&lang=1"
    data = await scrape_pib_asp_net(url, ministry, year, month, day)
    if data ==[]:
        url1 = "https://www.pib.gov.in/ViewBackgrounder.aspx?MenuId=51"
        data = await scrape_pib_asp_net(url1, ministry, year, month, day)
    return {'Backgrounders': data}

async def scrape_pib_facts(ministry=None, year=None, month=None, day=None):
    url = "https://www.pib.gov.in/AllFactsheet.aspx?MenuId=12&reg=3&lang=1"
    data = await scrape_pib_asp_net(url, ministry, year, month, day)
    if data ==[]:
        url1 = "https://www.pib.gov.in/ViewBackgrounder.aspx?MenuId=51"
        data = await scrape_pib_asp_net(url1, ministry, year, month, day)

    return {'Backgrounders': data}


TH_url = "https://learningcorner.epaper.thehindu.com/articles" 


@app.route('/')
@cache.cached(timeout=300)
async def index():
    playlist_data = []
    spotlight_episodes, insight_episodes, economy_episodes, current_episodes_air = [], [], [], []
    indian_express_articles_data, orf_articles_data, sansad_tv_summaries_data = [], [], []
    pib_backgrounders_result, pib_facts_result, forum_ca_result = {}, {}, []
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

    async with aiohttp.ClientSession(headers=headers) as session:
        try:
            loaded_playlists = await load_playlists(session) 
            for playlist_item in loaded_playlists: 
                try:
                    videos = await fetch_videos_from_playlist(session, playlist_item['id'])
                    recent_videos = filter_videos_by_date(videos, days=5)
                    unseen_videos = filter_unseen_videos(recent_videos)
                    playlist_data.append({
                        'id': playlist_item['id'],
                        'title': playlist_item['title'],
                        'unseen_count': len(unseen_videos),
                        'channel': playlist_item['channel_title']
                    })
                except Exception as e:
                    print(f"Error processing playlist {playlist_item.get('id', 'N/A')}: {e}")
            playlist_data.sort(key=lambda x: x['unseen_count'], reverse=True)
        except Exception as e:
            print(f"Error loading or processing playlists: {e}")

        tasks_to_run = [
            scrape_air_spotlight(),
            scrape_air_insight(),
            scrape_air_economy(),
            scrape_current_affairs_air(),
            scrape_indian_express_articles(),
            scrape_orf_articles(),
            scrape_AIR_sansad_tv_summaries_Iasgyan(),
            scrape_pib(), 
            scrape_pib_facts(),
            scrape_forumias(),
            scrape_insights_articles() 
        ]
        forum_ca = await scrape_forumias_combined()
        results = await asyncio.gather(*tasks_to_run, return_exceptions=True)

        if not isinstance(results[0], Exception): spotlight_episodes = results[0]
        else: print(f"Error scraping AIR Spotlight: {results[0]}")

        if not isinstance(results[1], Exception): insight_episodes = results[1]
        else: print(f"Error scraping AIR Insight: {results[1]}")

        if not isinstance(results[2], Exception): economy_episodes = results[2]
        else: print(f"Error scraping AIR Economy: {results[2]}")

        if not isinstance(results[3], Exception): current_episodes_air = results[3]
        else: print(f"Error scraping AIR Current Affairs: {results[3]}")

        if not isinstance(results[4], Exception): indian_express_articles_data = results[4]
        else: print(f"Error scraping Indian Express: {results[4]}")

        if not isinstance(results[5], Exception): orf_articles_data = results[5]
        else: print(f"Error scraping ORF articles: {results[5]}")

        if not isinstance(results[6], Exception): sansad_tv_summaries_data = results[6]
        else: print(f"Error scraping Sansad TV Summaries (IASGyan): {results[6]}")
        
        if not isinstance(results[7], Exception): pib_backgrounders_result = results[7]
        else: print(f"Error scraping PIB Backgrounders: {results[7]}")

        if not isinstance(results[8], Exception): pib_facts_result = results[8]
        else: print(f"Error scraping PIB Facts: {results[8]}")
        
        if not isinstance(results[9], Exception): forum_ca_result = results[9]
        else: print(f"Error scraping ForumIAS CA: {results[9]}")
        if not isinstance(results[9], Exception): insights_articles_result  = results[10]
        else: print(f"Error scraping ForumIAS CA: {results[10]}")

    spotlight_unheard = filter_unheard_episodes(filter_recent_episodes(spotlight_episodes or [], days=5))
    insight_unheard = filter_unheard_episodes(filter_recent_episodes(insight_episodes or [], days=5))
    economy_unheard = filter_unheard_episodes(filter_recent_episodes(economy_episodes or [], days=5))
    current_unheard_air = filter_unheard_episodes(filter_recent_episodes(current_episodes_air or [], days=10))


    return render_template('index.html', 
                           playlists=playlist_data, 
                           spotlight_unheard_count=len(spotlight_unheard), 
                           insight_unheard_count=len(insight_unheard),
                           economy_unheard_count=len(economy_unheard),
                           current_unheard_count=len(current_unheard_air), 
                           sansad_tv_summaries=sansad_tv_summaries_data or [], 
                           pib_backgrounders=pib_backgrounders_result or {}, 
                           subreddits=SUBREDDITS,
                           articles=indian_express_articles_data or [], 
                           orfarticles=orf_articles_data or [],  
                           pibfacts=pib_facts_result or {},
                           insightarticles=insights_articles_result or [],
                           forum_ca=forum_ca
                           ) 

@app.route('/unseen_videos/<playlist_id>')
async def unseen_videos(playlist_id):
    unseen_vids = [] 
    try:
        async with aiohttp.ClientSession() as session:
            videos = await fetch_videos_from_playlist(session, playlist_id)
            recent_videos = filter_videos_by_date(videos, days=5)
            unseen_vids = filter_unseen_videos(recent_videos)
    except Exception as e:
        print(f"Error in /unseen_videos/{playlist_id}: {e}")
    return render_template('unseen_videos.html', videos=unseen_vids, playlist_id=playlist_id)

@app.route('/pib')
async def pibscrap():
    ministry = request.args.get('ministry','0') 
    year = request.args.get('year','2024')
    month = request.args.get('month','0')
    day = request.args.get('day','0')

    pib_data = await scrape_pib(ministry=ministry, year=year, month=month, day=day)
    return render_template('pib.html', pib_backgrounders=pib_data or {})

@app.route('/pib_facts')
async def pibscrapfacts():
    ministry = request.args.get('ministry', '0') 
    year = request.args.get('year', '2024')
    month = request.args.get('month', '0')
    day = request.args.get('day', '0')

    pib_data = await scrape_pib_facts(ministry=ministry, year=year, month=month, day=day)
    return render_template('pib_facts.html', pib_backgrounders=pib_data or {}) 

@app.route('/mark_watched', methods=['POST'])
def mark_watched():
    video_ids = request.form.getlist('video_ids')
    save_watched_videos(video_ids)
    playlist_id = request.form.get('playlist_id')
    if playlist_id:
        return redirect(url_for('unseen_videos', playlist_id=playlist_id))
    return redirect(url_for('index'))


@app.route('/add_playlist', methods=['GET', 'POST'])
def add_playlist_route():
    if request.method == 'POST':
        playlist_id = request.form.get('playlist_id', '').strip()
        if playlist_id:
            add_playlist(playlist_id)
        return redirect(url_for('index'))
    return render_template('add_playlist.html')

async def _render_air_episodes_page(scrape_function, days_filter, template_name='spotlight.html'):
    episodes_data = []
    try:
        raw_episodes = await scrape_function()
        recent_episodes_data = filter_recent_episodes(raw_episodes or [], days=days_filter)
        episodes_data = filter_unheard_episodes(recent_episodes_data or [])
    except Exception as e:
        print(f"Error in AIR episodes route for {scrape_function.__name__}: {e}")
    return render_template(template_name, episodes=episodes_data)

@app.route('/spotlight')
async def spotlight():
    return await _render_air_episodes_page(scrape_air_spotlight, 5)

@app.route('/Insight') 
async def insight():
    return await _render_air_episodes_page(scrape_air_insight, 5)

@app.route('/aireconomy')
async def aireconomy():
    return await _render_air_episodes_page(scrape_air_economy, 5)

@app.route('/aircurrentaffairs')
async def airCA():
    return await _render_air_episodes_page(scrape_current_affairs_air, 10)


@app.route('/mark_listened', methods=['POST'])
def mark_listened():
    episode_links = request.form.getlist('episode_links')
    save_listened_episodes(episode_links)
    return redirect(request.referrer or url_for('index'))


BASE_URL_MEA = "https://www.mea.gov.in/bilateral-documents.htm" 

async def fetch_page_mea(session, url): # Renamed
    try:
        async with session.get(url, timeout=20) as response: 
            if response.status != 200:
                print(f"Failed to fetch MEA page {url}: {response.status}")
                return None
            return await response.text()
    except Exception as e:
        print(f"Error fetching MEA page {url}: {e}")
        return None

async def parse_page_mea(content, days_ago_cutoff_date):
    documents = []
    continue_scraping = True 
    if not content:
        return documents, False 

    soup = BeautifulSoup(content, "html.parser")
    item_list = soup.find('ul', class_='commonListing')
    
    if item_list:
        for item in item_list.find_all('li'):
            title_link = item.find('a', class_='searchContent')
            date_container = item.find('span', class_='date') 
            
            if title_link and date_container:
                title = title_link.text.strip()
                doc_url = title_link['href']
                if not doc_url.startswith('http'):
                    doc_url = f"https://www.mea.gov.in{doc_url}" if doc_url.startswith('/') else f"https://www.mea.gov.in/{doc_url}"
                
                date_str_raw = date_container.text.strip() 
                
                try:
                    date_obj = datetime.strptime(date_str_raw, "%B %d, %Y").replace(tzinfo=timezone.utc)
                    if date_obj >= days_ago_cutoff_date:
                        documents.append({
                            'title': title,
                            'url': doc_url,
                            'date': date_obj.strftime("%B %d, %Y") 
                        })
                    else:
                        continue_scraping = False 
                        break 
                except ValueError:
                    print(f"Error parsing MEA date: {date_str_raw} for title '{title}'")
    else:
        print("MEA commonListing not found.")
        continue_scraping = False 

    return documents, continue_scraping

def get_next_page_url_mea(content):
    if not content: return None
    soup = BeautifulSoup(content, "html.parser")
    next_link = soup.find('a', class_='next')
    if next_link and 'href' in next_link.attrs:
        href = next_link['href']
        if href.startswith('http'):
            return href
        elif href.startswith('/'):
            return f"https://www.mea.gov.in{href}"
        else:
            return f"https://www.mea.gov.in/bilateral-documents/{href}"
    return None

async def scrape_bilateral_documents():
    all_documents = []
    try:
        async with aiohttp.ClientSession() as session:
            current_url = f"{BASE_URL_MEA}?53/Bilateral/Multilateral_Documents" 
            days_ago_cutoff_date = (datetime.now(timezone.utc) - timedelta(days=90)).replace(hour=0, minute=0, second=0, microsecond=0)

            page_count = 0
            max_pages = 10

            while current_url and page_count < max_pages:
                page_count += 1
                print(f"Scraping MEA page {page_count}: {current_url}")
                content = await fetch_page_mea(session, current_url)
                if not content:
                    break

                documents_on_page, continue_scraping = await parse_page_mea(content, days_ago_cutoff_date)
                all_documents.extend(documents_on_page)

                if not continue_scraping:
                    print("Stopping MEA scraping based on date or parsing issue.")
                    break
                
                current_url = get_next_page_url_mea(content)
                if not current_url:
                    print("No next page found for MEA.")
                    break
                await asyncio.sleep(1)
    except Exception as e:
        print(f"Error during MEA bilateral documents scraping: {e}")
    return all_documents

@app.route('/MEAsite')
@cache.cached(timeout=3600) 
async def bilateral_documents():
    documents_data = await scrape_bilateral_documents()
    return render_template('bilateral_documents.html', documents=documents_data or [])

async def scrape_prs_india():
    cards_data = []
    try:
        url = "https://prsindia.org"
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=20) as response:
                if response.status != 200:
                    print(f"Failed to fetch PRS India data: {response.status}")
                    return []
                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")
                
                right_banner = soup.find('div', class_='right-banner') 
                if right_banner:
                    for item in right_banner.find_all(['div','section'], class_=re.compile(r"col-\w*-6|card-item-class")): 
                        image_tag = item.find('img')
                        link_tag = item.find('a')
                        title_tag = item.find(['h3','h4','h5']) 
                        
                        if link_tag and title_tag: 
                            img_src = None
                            if image_tag and 'src' in image_tag.attrs:
                                img_src = image_tag['src']
                                if not img_src.startswith('http'):
                                    img_src = url + img_src if img_src.startswith('/') else url + '/' + img_src
                            
                            link_href = link_tag['href']
                            if not link_href.startswith('http'):
                                link_href = url + link_href if link_href.startswith('/') else url + '/' + link_href

                            cards_data.append({
                                'title': title_tag.text.strip(),
                                'image_url': img_src,
                                'link_url': link_href
                            })
                else:
                    print("PRS India: right-banner not found.")

    except Exception as e:
        print(f"Error scraping PRS India: {e}")
    return cards_data

@app.route('/prsindia')
@cache.cached(timeout=3600) 
async def prs_india():
    scraped_cards = await scrape_prs_india() 
    return render_template('prsindia.html', cards=scraped_cards or [])


async def scrape_prs_bills(session, search_keyword=None, year=None, status=None):
    bills_data = []
    try:
        base_url = "https://prsindia.org/billtrack/category/billtrack"
        params = {}
        
        if search_keyword:
            params['BillActsBillsParliamentSearch[title]'] = search_keyword
        if status:
            params['BillActsBillsParliamentSearch[bill_status_id]'] = status
        if year:
            params['BillActsBillsParliamentSearch[date_of_introduction]'] = year
            
        async with session.get(base_url, params=params, timeout=20) as response:
            if response.status != 200:
                print(f"Failed to fetch PRS India bills data: {response.status}")
                return []
            
            content = await response.text()
            soup = BeautifulSoup(content, "html.parser")
            
           
            for row in soup.find_all('div', class_='views-row'):
                
                title_div = row.find('div', class_='views-field-title-field')
                status_div = row.find('div', class_='views-field-field-bill-status')
                
                if title_div and status_div:
                    title_tag = title_div.find('h3', class_='cate')
                    if title_tag and title_tag.a:
                        bill_url = title_tag.a['href']
                        if not bill_url.startswith('http'):
                            bill_url = f"https://prsindia.org{bill_url}"
                        
                        status_span = status_div.find('span')
                        status_text = status_span.text.strip() if status_span else "Unknown"
                        
                        bills_data.append({
                            'title': title_tag.a.text.strip(),
                            'url': bill_url,
                            'status': status_text
                        })
    except Exception as e:
        print(f"Error scraping PRS India bills: {e}")
    
    return bills_data


@app.route('/prsindia_bills')
async def prs_india_bills():
    search_keyword = request.args.get('search', '')
    year = request.args.get('year', str(datetime.now().year))
    status = request.args.get('status', '')
    
    async with aiohttp.ClientSession() as session:
        bills = await scrape_prs_bills(session, search_keyword, year, status)
    
    return render_template('prsindia_bills.html', 
                          bills=bills, 
                          search_keyword=search_keyword, 
                          year=year,
                          status=status)


async def scrape_current_affairs_iasgyan():
    current_affairs_data = [] 
    try:
        url = "https://www.iasgyan.in/daily-current-affairs"
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=20) as response:
                if response.status != 200:
                    print(f"Failed to fetch IASGyan Current Affairs data: {response.status}")
                    return []
                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")

                cutoff_date_iasgyan = datetime.now() - timedelta(days=6)

                for article_block in soup.find_all('div', class_='shadow mt-4 rounded-2'): 
                    title_tag_ias = article_block.find('h3', class_='fw-semibold text-white m-0 fs-5')
                    article_links_list = article_block.find_all('a', class_='w-100')

                    if title_tag_ias and article_links_list:
                        date_str_match = re.search(r'–\s*(.+)$', title_tag_ias.text.strip())
                        if not date_str_match: continue
                        
                        date_str_cleaned = re.sub(r'(\d+)(st|nd|rd|th|TH|ND|RD|ST)\s*', r'\1 ', date_str_match.group(1).strip())
                        date_str_cleaned = date_str_cleaned.replace("  ", " ") 
                        
                        try:
                            article_date_obj = datetime.strptime(date_str_cleaned.strip(), '%d %B %Y') 
                        except ValueError as ve:
                            print(f"IASGyan: Error parsing date '{date_str_cleaned}': {ve}")
                            continue
                        
                        if article_date_obj >= cutoff_date_iasgyan:
                            articles_for_date = []
                            for link_item in article_links_list:
                                articles_for_date.append({
                                    'title': link_item.text.strip(),
                                    'url': link_item['href'] if link_item.get('href') else '#'
                                })
                            if articles_for_date: 
                                current_affairs_data.append({
                                    'date': article_date_obj, 
                                    'articles': articles_for_date
                                })
        current_affairs_data.sort(key=lambda x: x['date'], reverse=True) 
    except Exception as e:
        print(f"Error scraping IASGyan Current Affairs: {e}")
    return current_affairs_data


async def scrape_AIR_sansad_tv_summaries_Iasgyan():
    summaries_data = []
    try:
        url = "https://www.iasgyan.in/sansad-tv-air-summaries"
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=20) as response:
                if response.status != 200:
                    print(f"Failed to fetch IASGyan Sansad TV & AIR summaries: {response.status}")
                    return []
                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")

                for summary_block_item in soup.find_all('div', class_='content_bx'):
                    title_tag_sum = summary_block_item.find('div', class_='title').find('a') if summary_block_item.find('div', class_='title') else None
                    date_tag_sum = summary_block_item.find('li', class_='text-muted')
                    description_tag_sum = summary_block_item.find('div', class_='short_descr').find('ol') if summary_block_item.find('div', class_='short_descr') else None
                    read_more_tag_sum = summary_block_item.find('div', class_='readmore_btn').find('a') if summary_block_item.find('div', class_='readmore_btn') else None

                    if not (title_tag_sum and date_tag_sum and description_tag_sum and read_more_tag_sum):
                        continue 

                    title_text = title_tag_sum.text.strip()
                    doc_url_sum = title_tag_sum['href']
                    
                    summary_date_str_raw = " ".join(date_tag_sum.text.strip().split()).replace(',', '') 
                    try:
                        if len(summary_date_str_raw.split()[1]) == 3:
                             parsed_date_sum = datetime.strptime(summary_date_str_raw, '%d %b %Y')
                        else: 
                             parsed_date_sum = datetime.strptime(summary_date_str_raw, '%d %B %Y')
                    except ValueError:
                        print(f"IASGyan Summaries: Could not parse date '{summary_date_str_raw}' for '{title_text}'")
                        parsed_date_sum = datetime.min 

                    summary_points_list = [li.text.strip() for li in description_tag_sum.find_all('li')]

                    summaries_data.append({
                        'title': title_text,
                        'url': doc_url_sum,
                        'date_obj': parsed_date_sum, 
                        'date': parsed_date_sum.strftime('%d %b %Y') if parsed_date_sum != datetime.min else summary_date_str_raw,
                        'points': summary_points_list,
                        'read_more_url': read_more_tag_sum['href']
                    })
                
                summaries_data.sort(key=lambda x: x['date_obj'], reverse=True) 
                return summaries_data[:3] 
    except Exception as e:
        print(f"Error scraping IASGyan Sansad TV summaries: {e}")
    return summaries_data 


async def scrape_indian_express_articles():
    articles_data = []
    try:
        url = "https://indianexpress.com/section/upsc-current-affairs/upsc-essentials/"
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url, timeout=20) as response:
                if response.status != 200:
                    print(f"Failed to fetch Indian Express UPSC articles: {response.status}")
                    return []
                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")
                
                one_week_ago_cutoff = datetime.now(timezone.utc) - timedelta(days=7)
                
                for article_div in soup.find_all('div', class_='articles'):
                    try:
                        context_div = article_div.find('div', class_='img-context')
                        if not context_div: continue

                        title_tag = context_div.find('h2', class_='title').find('a')
                        if not title_tag: continue
                        
                        title_text = title_tag.text.strip()
                        doc_url = title_tag['href']
                        date_div = context_div.find('div', class_='date')
                        date_str_raw = date_div.text.strip() if date_div else ""
                        summary_tag = context_div.find('p')
                        summary_text = summary_tag.text.strip() if summary_tag else ""
                        snaps_div = article_div.find('div', class_='snaps')
                        image_url = None
                        if snaps_div:
                            img = snaps_div.find('img')
                            if img:
                                image_url = img.get('src') or img.get('data-src')
                        article_date_obj = None
                        clean_date_str = date_str_raw.replace('IST', '').strip()
                        clean_date_str = re.sub(r'\s+', ' ', clean_date_str)
                        
                        try:
                            article_date_obj = datetime.strptime(clean_date_str, '%B %d, %Y %H:%M')
                        except ValueError:
                            try:
                                article_date_obj = datetime.strptime(clean_date_str, '%B %d, %Y')
                            except ValueError:
                                continue 
                        article_date_obj = article_date_obj.replace(tzinfo=timezone.utc)

                        if article_date_obj >= one_week_ago_cutoff:
                            articles_data.append({
                                'title': title_text,
                                'url': doc_url,
                                'image_url': image_url,
                                'date': date_str_raw,
                                'summary': summary_text,
                                'date_obj': article_date_obj 
                            })
                    except Exception as inner_e:
                        print(f"Error parsing specific IE article: {inner_e}")
                        continue

        articles_data.sort(key=lambda x: x['date_obj'], reverse=True)

    except Exception as e:
        print(f"Error scraping Indian Express articles: {e}")
    return articles_data


async def scrape_full_article(url):
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, timeout=20) as response:
                if response.status != 200:
                    print(f"Failed to fetch article content from {url}: {response.status}")
                    return None
                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")
                
                content_div = soup.find('div', id='pcl-full-content') 
                if not content_div: 
                    content_div = soup.find('article') or soup.find('main') or soup.find('div', class_=re.compile(r'content|article-body|story'))

                if not content_div:
                    print(f"Could not find main content container for {url}")
                    return f"<p>Content not found. Please visit <a href='{url}'>original article</a>.</p>"
                
                title_tag_full = soup.find(['h1', 'h2'], class_=re.compile(r'title|headline')) 
                if not title_tag_full: title_tag_full = soup.find('h1') 
                title_text_full = title_tag_full.get_text(strip=True) if title_tag_full else "Article"
                
                author_date_div = soup.find(['div','span'], class_=re.compile(r'editor|author|date|byline|meta')) 
                author_date_text_full = author_date_div.get_text(separator=" ", strip=True) if author_date_div else ""
                

                elements = content_div.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'blockquote', 'figure', 'table'])
                formatted_content_parts = []
                for elem in elements:
                    if elem.name in ['h1','h2','h3','h4','h5','h6']:
                        formatted_content_parts.append(f"<{elem.name}>{elem.get_text(strip=True)}</{elem.name}>")
                    elif elem.name == 'p':
                        
                        if elem.find_parent('li'): 
                            continue 
                        formatted_content_parts.append(f"<p>{elem.get_text(separator=' ', strip=True)}</p>")
                    elif elem.name in ['ul', 'ol']:
                        list_items_html = "".join([f"<li>{li_item.get_text(separator=' ', strip=True)}</li>" for li_item in elem.find_all('li', recursive=False)])
                        formatted_content_parts.append(f"<{elem.name}>{list_items_html}</{elem.name}>")
                    elif elem.name == 'blockquote':
                        formatted_content_parts.append(f"<blockquote>{elem.get_text(separator=' ', strip=True)}</blockquote>")
                    elif elem.name == 'figure':
                        img = elem.find('img')
                        caption = elem.find('figcaption')
                        if img and 'src' in img.attrs:
                            img_html = f"<img src='{img['src']}' alt='{img.get('alt','Image')}' style='max-width:100%; height:auto;'>"
                            if caption:
                                img_html += f"<figcaption>{caption.get_text(strip=True)}</figcaption>"
                            formatted_content_parts.append(f"<figure>{img_html}</figure>")
                    elif elem.name == 'table':
                        table_html = "<table>"
                        for tr in elem.find_all('tr'):
                            table_html += "<tr>"
                            for th_td in tr.find_all(['th', 'td']):
                                table_html += f"<{th_td.name}>{th_td.get_text(strip=True)}</{th_td.name}>"
                            table_html += "</tr>"
                        table_html += "</table>"
                        formatted_content_parts.append(table_html)


                full_article_html = f"<h1>{title_text_full}</h1>"
                if author_date_text_full:
                    full_article_html += f"<div class='article-meta' style='color:grey; margin-bottom:1em;'>{author_date_text_full}</div>"
                full_article_html += "\n".join(formatted_content_parts)
                
                return full_article_html
    except Exception as e:
        print(f"Error scraping full article from {url}: {e}")
        return f"<p>Error loading article content. Please visit <a href='{url}'>original article</a>.</p>"


@app.route('/article/<path:url>')
async def show_article(url):
    if not url.startswith('http'):
        print(f"Warning: URL '{url}' might be partial. Assuming it's complete.")

    full_content_html = await scrape_full_article(url)
    if full_content_html is None:
        full_content_html = f"<p>Failed to fetch article content for {url}.</p>"
    return render_template('full_article.html', content=full_content_html)


async def scrape_insights_articles():
    articles_data_insights = []
    try:
        url = 'https://www.insightsonindia.com/upsc-mains-answer-writing-2025-insights-ias/'
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url, timeout=20) as response:
                if response.status != 200:
                    print(f"Failed to fetch Insights Answer Writing links: {response.status}")
                    return []
                content = await response.text()
                soup = BeautifulSoup(content, 'html.parser')
                count = 0
                for div_block in soup.find_all('div', class_='list_div'):
                    if count >= 2: break 
                    
                    ul_tag = div_block.find('ul', class_='lcp_catlist')
                    if ul_tag:
                        for li in ul_tag.find_all('li'):
                            a_tag = li.find('a')
                            if a_tag and a_tag.get('href'):
                                title = a_tag.text.strip()
                                link = a_tag['href']
                                articles_data_insights.append({
                                    'title': title,
                                    'link': link
                                })
                        count += 1
                        
    except Exception as e:
        print(f"Error scraping Insights Answer Writing links: {e}")
    return articles_data_insights

async def scrape_full_article_insight(article_url):
    filtered_content_parts = []
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(article_url, timeout=20) as response:
                if response.status != 200:
                    print(f"Failed to fetch Insights article content from {article_url}: {response.status}")
                    return None 
                content = await response.text()
                soup = BeautifulSoup(content, 'html.parser')


                article_body = soup.find('div', class_=re.compile(r'entry-content|article-content|post-content'))
                if not article_body:
                    print(f"Insights: Could not find article body for {article_url}")
                    return [{'type': 'p', 'text': 'Article content not found.'}]

                current_section_text = None

                for tag_item in article_body.find_all(['h1','h2','h3', 'h4', 'p', 'ul', 'ol', 'blockquote', 'table']):
                    if tag_item.name in ['h1','h2','h3','h4']: 
                        current_section_text = tag_item.text.strip()
                        filtered_content_parts.append({'type': tag_item.name, 'text': current_section_text})
                    elif tag_item.name == 'p':
                   
                        filtered_content_parts.append({'type': 'p', 'text': tag_item.text.strip()})
                    elif tag_item.name in ['ul', 'ol']:
                        items = [li.get_text(strip=True) for li in tag_item.find_all('li')]
                        if items:
                            filtered_content_parts.append({'type': 'list', 'ordered': tag_item.name == 'ol', 'items': items})
                    elif tag_item.name == 'blockquote':
                        filtered_content_parts.append({'type': 'blockquote', 'text': tag_item.get_text(strip=True)})
                    elif tag_item.name == 'table':
                        rows = []
                        for tr in tag_item.find_all('tr'):
                            cells = [td.get_text(strip=True) for td in tr.find_all(['th', 'td'])]
                            rows.append(cells)
                        if rows:
                           filtered_content_parts.append({'type': 'table', 'rows': rows})


    except Exception as e:
        print(f"Error scraping full Insights article from {article_url}: {e}")
        return [{'type': 'p', 'text': f'Error loading article: {e}'}]
    return filtered_content_parts


@app.route('/article_insight/<path:url>')
async def show_article_insight(url):
    full_content_data = await scrape_full_article_insight(url) 
    if not full_content_data: 
        return "Failed to load the Insights article content.", 404
    return render_template('full_article_insight.html', content=full_content_data)


async def scrape_orf_articles():
    orf_articles_data = []
    try:
        url = 'https://www.orfonline.org/content-type/issue-briefs'
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url, timeout=20) as response:
                if response.status != 200:
                    print(f"Failed to fetch ORF articles from {url}: {response.status}")
                    return []
                content = await response.text()
                soup = BeautifulSoup(content, 'html.parser')

                cutoff_orf = datetime.now(timezone.utc) - timedelta(days=45)

                potential_articles = soup.find_all('div', class_=re.compile(r'col-|card|item|listing|post'))
                
                for article_block in potential_articles:
                    title_tag = article_block.find(['h2', 'h3'])
                    if not title_tag: 
                        continue
                    
                    title_text = title_tag.get_text(strip=True)
                    if not title_text or len(title_text) < 10: 
                        continue

                   
                    link_tag = title_tag.find('a')
                    if not link_tag:
                       
                        link_tag = article_block.find('a')
                    
                    if not link_tag or not link_tag.get('href'):
                        continue
                        
                    doc_url = link_tag['href']
                    if not doc_url.startswith('http'):
                        doc_url = f"https://www.orfonline.org{doc_url}" if doc_url.startswith('/') else f"https://www.orfonline.org/{doc_url}"

                   
                    date_tag = article_block.find('time') or article_block.find(class_=re.compile(r'date|meta|time'))
                    article_date_obj = None
                    
                    if date_tag:
                        date_str = date_tag.get_text(strip=True)
                        try:
                            
                            article_date_obj = datetime.strptime(date_str, "%b %d, %Y").replace(tzinfo=timezone.utc)
                        except ValueError:
                            try:
                                article_date_obj = datetime.strptime(date_str, "%d %B %Y").replace(tzinfo=timezone.utc)
                            except ValueError:
                                pass 
                    
                   
                    if not article_date_obj:
                         article_date_obj = datetime.now(timezone.utc) 

                    if article_date_obj >= cutoff_orf:
                        
                        desc_tag = article_block.find('p')
                        desc_text = desc_tag.get_text(strip=True) if desc_tag else ""
                        
                       
                        if any(a['link'] == doc_url for a in orf_articles_data):
                            continue

                        orf_articles_data.append({
                            'title': title_text,
                            'link': doc_url,
                            'date_obj': article_date_obj,
                            'date': article_date_obj.strftime('%B %d, %Y'),
                            'description': desc_text,
                            'author': "ORF" 
                        })

    except Exception as e:
        print(f"Error scraping ORF articles: {e}")
    
    
    seen = set()
    unique_data = []
    for d in orf_articles_data:
        if d['link'] not in seen:
            seen.add(d['link'])
            unique_data.append(d)
            
    unique_data.sort(key=lambda x: x['date_obj'], reverse=True)
    return unique_data

async def scrape_forumias(url_path="7pm"):
    url = f"https://forumias.com/blog/{url_path}/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url, timeout=15) as response:
            if response.status != 200:
                return []
            
            html = await response.text()
            soup = BeautifulSoup(html, "html.parser")
            
            sections_data = []
            articles_list = []
            date_groups = soup.find_all('div', class_='cat-archive-date-group')
            
            for group in date_groups:
                date_div = group.find('div', class_='post-date')
                date_text = date_div.get_text(" ", strip=True) if date_div else ""
                
                links = group.find_all('a')
                for a in links:
                    articles_list.append({
                        'title': a.get_text(strip=True),
                        'url': a.get('href'),
                        'date': date_text
                    })

            if articles_list:
                sections_data.append({
                    'section': f"ForumIAS {url_path.upper()} Editorials",
                    'articles': articles_list
                })
                
            return sections_data

async def scrape_forumias_combined():
    results = await asyncio.gather(
        scrape_forumias("7pm"),
        scrape_forumias("9pm")
    )
    return [item for sublist in results for item in sublist]


@app.route('/forumias')
async def forumias():
    scraped_sections = await scrape_forumias_combined()
    return render_template('forumias.html', sections=scraped_sections)

@app.route('/forumias/<section>')
async def forumias_section(section):
    if section not in ['7pm', '9pm']:
        return "Invalid section", 404
    scraped_sections = await scrape_forumias(section)
    return render_template('forumias.html', sections=scraped_sections)

@app.route('/TH_article/<path:url>')
async def show_th_article(url):
    article_content_data = await scrape_TH_learning(url) 
    if not article_content_data: 
        return "Failed to load The Hindu Learning Corner article content.", 404
    return render_template('article_content.html', content=article_content_data)

async def scrape_TH_learning(article_url_th):
    article_content_th = []
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(article_url_th, timeout=20) as response:
                if response.status != 200:
                    print(f"Failed to fetch TH Learning article from {article_url_th}: {response.status}")
                    return None
                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")

     
                main_content_area = soup.find('div', class_=re.compile(r'articlebody|content|story-body'))
                if not main_content_area:
                     main_content_area = soup

                for tag_item_th in main_content_area.find_all(['h1','h2','h3', 'h4', 'p', 'ul', 'ol']):
                    if tag_item_th.name in ['h1','h2','h3','h4']:
                        article_content_th.append({'type': tag_item_th.name, 'text': tag_item_th.text.strip()})
                    elif tag_item_th.name == 'p':
                        article_content_th.append({'type': 'p', 'text': tag_item_th.text.strip()})
                    elif tag_item_th.name in ['ul', 'ol']:
                        items = [li.get_text(strip=True) for li in tag_item_th.find_all('li')]
                        if items:
                             article_content_th.append({'type':'list', 'ordered': tag_item_th.name=='ol', 'items':items})
                
                if not article_content_th and main_content_area == soup:
                    print(f"TH Learning: No specific content tags found on {article_url_th}, page might be structured differently.")

    except Exception as e:
        print(f"Error scraping TH Learning article from {article_url_th}: {e}")
        return None 
    return article_content_th


if __name__ == '__main__':
    app.run(debug=True)