import requests
from bs4 import BeautifulSoup
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import gradio as gr
from tqdm import tqdm
from termcolor import colored
import random
import re
import concurrent.futures
from urllib.parse import urljoin
def get_random_color():
    colors = ["red", "green", "yellow", "blue", "magenta", "cyan", "white"]
    return random.choice(colors)


def print_colored(text):
    color = get_random_color()
    print(colored(text, color))
def scrape_website(url):
    try:
        # A timeout keeps one slow host from stalling the whole scrape.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        text = " ".join(p.get_text() for p in soup.find_all(["p", "div", "span"]))
        return text, soup
    except requests.RequestException as e:
        print_colored(f"Error scraping {url}: {e}")
        return "", None
def clean_text(text):
    return " ".join(text.split())
def extract_links(soup, base_url):
    links = []
    if soup:
        for a in soup.find_all('a', href=True):
            # urljoin correctly resolves relative hrefs ("/about", "about.html")
            # against the page URL.
            link = urljoin(base_url, a['href'])
            if base_url in link and link not in links:
                links.append(link)
    return links
def process_data(text):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    # BART-Large-CNN accepts at most 1024 *tokens*; slicing to 1024 characters
    # is a rough, conservative cut, and truncation=True catches any overflow.
    max_input_chars = 1024
    inputs = text[:max_input_chars]
    summary = summarizer(inputs, max_length=300, min_length=200, do_sample=True, truncation=True)
    return summary[0]['summary_text']
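
# Sketch of a token-aware alternative to the character cut in process_data:
# split the text into chunks that fit BART's 1024-token window, summarize each
# chunk, and join the results. The chunk size and length bounds are
# illustrative assumptions, not tuned values.
def summarize_long_text(text, summarizer=None, max_tokens=1024):
    if summarizer is None:
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    tokenizer = summarizer.tokenizer
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    summaries = []
    step = max_tokens - 2  # leave room for the special tokens BART adds
    for i in range(0, len(token_ids), step):
        chunk = tokenizer.decode(token_ids[i:i + step])
        # truncation=True is a backstop: decode/re-encode may not round-trip
        # to exactly the same token count.
        result = summarizer(chunk, max_length=150, min_length=30,
                            do_sample=False, truncation=True)
        summaries.append(result[0]['summary_text'])
    return " ".join(summaries)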
def generate_creative_text(summary1, summary2, summary3):
    tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
    model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
    input_text = f"Create a poem or short story about Ling and Sanny Lin, combining the following three summaries:\n\nSummary 1: {summary1}\n\nSummary 2: {summary2}\n\nSummary 3: {summary3}\n\nPoem/Short Story:"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)
    outputs = model.generate(
        inputs["input_ids"],
        # max_new_tokens bounds only the generated text; max_length counts the
        # prompt too and can leave no room to generate from a long prompt.
        max_new_tokens=300,
        num_return_sequences=1,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=inputs["attention_mask"]
    )
    creative_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    creative_text = re.sub(r'\s+', ' ', creative_text).strip()
    return creative_text
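
# The generate() call above decodes mostly greedily; for more varied
# "creative" output, sampling is a common alternative. do_sample, top_p, and
# temperature are standard transformers generation arguments, but the values
# below are illustrative assumptions, not tuned:
#
#     outputs = model.generate(
#         inputs["input_ids"],
#         attention_mask=inputs["attention_mask"],
#         max_new_tokens=300,
#         do_sample=True,
#         top_p=0.9,
#         temperature=0.8,
#         repetition_penalty=1.2,
#         pad_token_id=tokenizer.eos_token_id,
#     )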
def scrape_and_process_site(base_url, depth=1, max_depth=2, visited=None):
    # Track visited URLs across the recursion so no page is fetched twice.
    if visited is None:
        visited = set()
    if base_url in visited:
        return []
    visited.add(base_url)
    print_colored(f"Scraping data from {base_url} (depth: {depth})...")
    main_text, soup = scrape_website(base_url)
    main_text = clean_text(main_text)
    all_text = [main_text] if main_text else []
    if depth < max_depth:
        links = extract_links(soup, base_url)
        for link in tqdm(links, desc=f"Scraping links from {base_url}"):
            # Each recursive call fetches its own page, so every link is
            # scraped exactly once.
            all_text.extend(scrape_and_process_site(link, depth + 1, max_depth, visited))
    return all_text
def scrape_and_process():
    base_urls = ["https://taellinglin.art", "https://sannylin.me", "https://linglin.fun"]
    results = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {executor.submit(scrape_and_process_site, url): url for url in base_urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                results[url] = " ".join(data)
            except Exception as exc:
                print_colored(f"{url} generated an exception: {exc}")

    def summarize_site(url):
        # Run the BART summarizer over the scraped text; otherwise the raw
        # page text would reach the generator despite the "summary" names.
        raw = results.get(url, "")
        return process_data(raw) if raw else "No data found."

    summary_taellinglin = summarize_site("https://taellinglin.art")
    summary_sannylin = summarize_site("https://sannylin.me")
    summary_linglin = summarize_site("https://linglin.fun")
    creative_text = generate_creative_text(summary_taellinglin, summary_sannylin, summary_linglin)
    return (f"<p style='color: {get_random_color()};'>{summary_taellinglin}</p>",
            f"<p style='color: {get_random_color()};'>{summary_sannylin}</p>",
            f"<p style='color: {get_random_color()};'>{summary_linglin}</p>",
            f"<p style='color: {get_random_color()};'>{creative_text}</p>")
def main():
    iface = gr.Interface(fn=scrape_and_process,
                         inputs=[],
                         outputs=["html", "html", "html", "html"])
    iface.launch(server_name="0.0.0.0")


if __name__ == "__main__":
    main()
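
# To run locally, assuming this file is saved as app.py (the Hugging Face
# Spaces convention): `python app.py`, then open the URL Gradio prints
# (http://0.0.0.0:7860 by default). The first request downloads the BART and
# GPT-2 XL checkpoints, so expect a long warm-up; the GPT-2 XL weights alone
# are several gigabytes.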