import requests
from bs4 import BeautifulSoup
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import gradio as gr
from tqdm import tqdm
from termcolor import colored
import random
import re
import concurrent.futures

def get_random_color():
    colors = ["red", "green", "yellow", "blue", "magenta", "cyan", "white"]
    return random.choice(colors)

def print_colored(text):
    color = get_random_color()
    print(colored(text, color))

def scrape_website(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        text = " ".join(p.get_text() for p in soup.find_all(["p", "div", "span"]))
        return text, soup
    except requests.RequestException as e:
        print_colored(f"Error scraping {url}: {e}")
        return "", None

def clean_text(text):
    cleaned_text = " ".join(text.split())
    return cleaned_text

def extract_links(soup, base_url):
    links = []
    if soup:
        for a in soup.find_all('a', href=True):
            link = a['href']
            if not link.startswith('http'):
                link = base_url + link
            if base_url in link and link not in links:
                links.append(link)
    return links

def process_data(text):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    max_input_length = 1024  # BART-Large-CNN handles up to 1024 tokens
    inputs = text[:max_input_length]  # Truncate to the max input length
    summary = summarizer(inputs, max_length=300, min_length=200, do_sample=True)
    return summary[0]['summary_text']

def generate_creative_text(summary1, summary2, summary3):
    tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
    model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
    
    input_text = f"Create a poem or short story about Ling and Sanny Lin, combining the following three summaries:\n\nSummary 1: {summary1}\n\nSummary 2: {summary2}\n\nSummary 3: {summary3}\n\nPoem/Short Story:"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)
    outputs = model.generate(
        inputs["input_ids"],
        max_length=300,
        num_return_sequences=1,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=inputs["attention_mask"]
    )
    
    creative_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    creative_text = re.sub(r'\s+', ' ', creative_text).strip()
    
    return creative_text

def scrape_and_process_site(base_url, depth=1, max_depth=2):
    print_colored(f"Scraping data from {base_url} (depth: {depth})...")
    main_text, soup = scrape_website(base_url)
    main_text = clean_text(main_text)
    all_text = [main_text]

    if depth < max_depth:
        links = extract_links(soup, base_url)
        
        for link in tqdm(links, desc=f"Scraping links from {base_url}"):
            link_text, _ = scrape_website(link)
            link_text = clean_text(link_text)
            if link_text:
                all_text.append(link_text)
                # Recursive call to scrape links deeper
                sub_links_text = scrape_and_process_site(link, depth + 1, max_depth)
                all_text.extend(sub_links_text)

    return all_text

def scrape_and_process():
    base_urls = ["https://taellinglin.art", "https://sannylin.me", "https://linglin.fun"]
    results = {}

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {executor.submit(scrape_and_process_site, url): url for url in base_urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                results[url] = " ".join(data)
            except Exception as exc:
                print_colored(f"{url} generated an exception: {exc}")

    summary_taellinglin = results.get("https://taellinglin.art", "No data found.")
    summary_sannylin = results.get("https://sannylin.me", "No data found.")
    summary_linglin = results.get("https://linglin.fun", "No data found.")
    
    creative_text = generate_creative_text(summary_taellinglin, summary_sannylin, summary_linglin)

    return (f"<p style='color: {get_random_color()};'>{summary_taellinglin}</p>", 
            f"<p style='color: {get_random_color()};'>{summary_sannylin}</p>", 
            f"<p style='color: {get_random_color()};'>{summary_linglin}</p>",
            f"<p style='color: {get_random_color()};'>{creative_text}</p>")

def main():
    iface = gr.Interface(fn=scrape_and_process, 
                         inputs=[], 
                         outputs=["html", "html", "html", "html"])
    iface.launch(server_name="0.0.0.0")

if __name__ == "__main__":
    main()