"""Scrape three related websites, summarize each with BART, and ask GPT-2 XL
to weave the summaries into a poem or short story, served via a Gradio UI."""

import concurrent.futures
import random
import re
from urllib.parse import urljoin

import gradio as gr
import requests
from bs4 import BeautifulSoup
from termcolor import colored
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM


def get_random_color():
    colors = ["red", "green", "yellow", "blue", "magenta", "cyan", "white"]
    return random.choice(colors)


def print_colored(text):
    color = get_random_color()
    print(colored(text, color))


def scrape_website(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        text = " ".join(p.get_text() for p in soup.find_all(["p", "div", "span"]))
        return text, soup
    except requests.RequestException as e:
        print_colored(f"Error scraping {url}: {e}")
        return "", None


def clean_text(text):
    # Collapse all runs of whitespace into single spaces.
    return " ".join(text.split())


def extract_links(soup, base_url):
    links = []
    if soup:
        for a in soup.find_all("a", href=True):
            # urljoin resolves relative hrefs correctly and leaves
            # absolute URLs untouched.
            link = urljoin(base_url, a["href"])
            # Keep only same-site links, without duplicates.
            if base_url in link and link not in links:
                links.append(link)
    return links


def process_data(text):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    # BART-Large-CNN accepts at most 1024 tokens; truncating to 1024
    # characters is a crude but safe approximation of that limit.
    max_input_length = 1024
    inputs = text[:max_input_length]
    summary = summarizer(inputs, max_length=300, min_length=200, do_sample=True)
    return summary[0]["summary_text"]


def generate_creative_text(summary1, summary2, summary3):
    tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
    model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
    input_text = (
        "Create a poem or short story about Ling and Sanny Lin, combining the "
        "following three summaries:\n\n"
        f"Summary 1: {summary1}\n\n"
        f"Summary 2: {summary2}\n\n"
        f"Summary 3: {summary3}\n\n"
        "Poem/Short Story:"
    )
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)
    outputs = model.generate(
        inputs["input_ids"],
        # Use max_new_tokens (not max_length) so the 300-token budget applies
        # to the generated text and is not consumed by the long prompt.
        max_new_tokens=300,
        num_return_sequences=1,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=inputs["attention_mask"],
    )
    creative_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return re.sub(r"\s+", " ", creative_text).strip()


def scrape_and_process_site(base_url, depth=1, max_depth=2):
    print_colored(f"Scraping data from {base_url} (depth: {depth})...")
    main_text, soup = scrape_website(base_url)
    all_text = [clean_text(main_text)]
    if depth < max_depth:
        links = extract_links(soup, base_url)
        for link in tqdm(links, desc=f"Scraping links from {base_url}"):
            # Recurse into each link; the recursive call scrapes the page
            # itself, so no separate fetch is needed here (avoids requesting
            # and appending the same page twice).
            all_text.extend(scrape_and_process_site(link, depth + 1, max_depth))
    return all_text


def scrape_and_process():
    base_urls = ["https://taellinglin.art", "https://sannylin.me", "https://linglin.fun"]
    results = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {
            executor.submit(scrape_and_process_site, url): url for url in base_urls
        }
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                # Summarize the concatenated page text with BART so the
                # creative-text prompt receives condensed summaries rather
                # than raw page dumps.
                results[url] = process_data(" ".join(data))
            except Exception as exc:
                print_colored(f"{url} generated an exception: {exc}")

    summary_taellinglin = results.get("https://taellinglin.art", "No data found.")
    summary_sannylin = results.get("https://sannylin.me", "No data found.")
    summary_linglin = results.get("https://linglin.fun", "No data found.")
results.get("https://linglin.fun", "No data found.") creative_text = generate_creative_text(summary_taellinglin, summary_sannylin, summary_linglin) return (f"
{summary_taellinglin}
", f"{summary_sannylin}
", f"{summary_linglin}
", f"{creative_text}
") def main(): iface = gr.Interface(fn=scrape_and_process, inputs=[], outputs=["html", "html", "html", "html"]) iface.launch(server_name="0.0.0.0") if __name__ == "__main__": main()