# AboutUs Space — app.py
# Author: taellinglin — commit 90b8b42 (verified)
import concurrent.futures
import random
import re
from urllib.parse import urljoin

import requests
import gradio as gr
from bs4 import BeautifulSoup
from termcolor import colored
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
def get_random_color():
    """Return one of the termcolor-supported color names, chosen at random."""
    palette = ("red", "green", "yellow", "blue", "magenta", "cyan", "white")
    return random.choice(palette)
def print_colored(text):
    """Print *text* to stdout in a randomly chosen terminal color."""
    print(colored(text, get_random_color()))
def scrape_website(url, timeout=10):
    """Fetch *url* and extract its visible text.

    Args:
        url: Page to download.
        timeout: Seconds before the HTTP request is abandoned. The original
            call had no timeout, so a single unresponsive server could hang
            the entire scrape indefinitely.

    Returns:
        A tuple ``(text, soup)`` where ``text`` is the space-joined contents
        of every ``<p>``/``<div>``/``<span>`` element and ``soup`` is the
        parsed BeautifulSoup tree, or ``("", None)`` on any request failure.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        # Join the tags most likely to hold visible copy on the page.
        text = " ".join(p.get_text() for p in soup.find_all(["p", "div", "span"]))
        return text, soup
    except requests.RequestException as e:
        # Best-effort scrape: report the failure and return an empty result
        # instead of crashing the whole run.
        print_colored(f"Error scraping {url}: {e}")
        return "", None
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space."""
    return " ".join(text.split())
def extract_links(soup, base_url):
    """Collect same-site links from a parsed page.

    Relative hrefs are resolved with :func:`urllib.parse.urljoin` instead of
    naive string concatenation, which mishandled hrefs without a leading
    slash (``"page.html"`` against a base lacking a trailing slash) and
    protocol-relative hrefs (``"//host/path"``).

    Args:
        soup: BeautifulSoup tree, or None (e.g. when the fetch failed).
        base_url: Site root used both to resolve and to filter links.

    Returns:
        De-duplicated list of absolute URLs containing *base_url*, in
        first-seen order.
    """
    links = []
    seen = set()  # O(1) membership test instead of rescanning the list
    if soup:
        for a in soup.find_all('a', href=True):
            link = urljoin(base_url, a['href'])  # absolute hrefs pass through unchanged
            if base_url in link and link not in seen:
                seen.add(link)
                links.append(link)
    return links
def process_data(text):
    """Summarize *text* with the BART-large-CNN summarization pipeline.

    The pipeline is built once and cached on the function object; the
    original rebuilt it — reloading the full model — on every call.

    Args:
        text: Raw page text; may be empty.

    Returns:
        The generated summary string, or ``""`` for empty input (the
        pipeline cannot summarize an empty prompt).
    """
    if not text:
        return ""
    summarizer = getattr(process_data, "_summarizer", None)
    if summarizer is None:
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        process_data._summarizer = summarizer  # cache for subsequent calls
    # NOTE: this truncates by *characters*, not tokens (the original comment
    # claimed tokens). 1024 characters is a conservative bound well under
    # BART's 1024-token input limit.
    max_input_length = 1024
    inputs = text[:max_input_length]
    summary = summarizer(inputs, max_length=300, min_length=200, do_sample=True)
    return summary[0]['summary_text']
def generate_creative_text(summary1, summary2, summary3):
    """Generate a poem/short story combining three site summaries via GPT-2 XL.

    Fixes over the original:
      * tokenizer and model are loaded once and cached on the function
        object instead of being reloaded (several GB) on every call;
      * ``max_new_tokens=300`` replaces ``max_length=300`` — the prompt
        alone may be up to 1024 tokens, so a total-length cap of 300 could
        leave no room to generate anything;
      * only the newly generated tokens are decoded, so the returned text
        no longer starts with the instruction prompt itself.

    Args:
        summary1, summary2, summary3: Summary strings for the three sites.

    Returns:
        The generated creative text with whitespace normalized.
    """
    cached = getattr(generate_creative_text, "_lm", None)
    if cached is None:
        tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
        model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
        generate_creative_text._lm = (tokenizer, model)  # cache for reuse
    else:
        tokenizer, model = cached
    input_text = f"Create a poem or short story about Ling and Sanny Lin, combining the following three summaries:\n\nSummary 1: {summary1}\n\nSummary 2: {summary2}\n\nSummary 3: {summary3}\n\nPoem/Short Story:"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)
    outputs = model.generate(
        inputs["input_ids"],
        max_new_tokens=300,
        num_return_sequences=1,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=inputs["attention_mask"]
    )
    # Skip the prompt tokens so only the model's continuation is returned.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    creative_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    creative_text = re.sub(r'\s+', ' ', creative_text).strip()
    return creative_text
def scrape_and_process_site(base_url, depth=1, max_depth=2, visited=None):
    """Recursively scrape a site, returning a list of cleaned page texts.

    Fixes over the original:
      * every linked page was downloaded twice — once directly in the loop
        and again by the recursive call; the loop now relies solely on the
        recursion (which scrapes the link's own text as its first element);
      * a shared ``visited`` set (new optional parameter, backward
        compatible) prevents re-fetching the same URL when pages link to
        each other.

    Args:
        base_url: Page to scrape; also used to filter same-site links.
        depth: Current recursion depth (1-based).
        max_depth: Links are followed only while ``depth < max_depth``.
        visited: URLs already scraped in this crawl; created when omitted.

    Returns:
        List of cleaned text strings, one per page scraped.
    """
    if visited is None:
        visited = set()
    visited.add(base_url)
    print_colored(f"Scraping data from {base_url} (depth: {depth})...")
    main_text, soup = scrape_website(base_url)
    all_text = [clean_text(main_text)]
    if depth < max_depth:
        links = extract_links(soup, base_url)
        for link in tqdm(links, desc=f"Scraping links from {base_url}"):
            if link in visited:
                continue  # already fetched earlier in this crawl
            sub_texts = scrape_and_process_site(link, depth + 1, max_depth, visited)
            # Drop empty results, matching the original's `if link_text` filter.
            all_text.extend(t for t in sub_texts if t)
    return all_text
def scrape_and_process():
    """Scrape the three sites in parallel, summarize each, and generate a
    combined creative text.

    Fix: the original assigned the *raw* concatenated page text to
    variables named ``summary_*`` and never called :func:`process_data` —
    the summarizer was dead code. Each site's text is now actually
    summarized before being displayed and fed to the generator.

    Returns:
        Four HTML ``<p>`` strings (one per site summary plus the creative
        text), each wrapped in a random color, for the Gradio interface.
    """
    base_urls = ["https://taellinglin.art", "https://sannylin.me", "https://linglin.fun"]
    results = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {executor.submit(scrape_and_process_site, url): url for url in base_urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                results[url] = " ".join(future.result())
            except Exception as exc:
                # A failed site is reported but must not sink the others.
                print_colored(f"{url} generated an exception: {exc}")

    def _summarize(url):
        # Summarize the scraped text; placeholder when the scrape produced
        # nothing (or raised above and never populated results[url]).
        text = results.get(url, "")
        return process_data(text) if text.strip() else "No data found."

    summary_taellinglin = _summarize("https://taellinglin.art")
    summary_sannylin = _summarize("https://sannylin.me")
    summary_linglin = _summarize("https://linglin.fun")
    creative_text = generate_creative_text(summary_taellinglin, summary_sannylin, summary_linglin)
    return (f"<p style='color: {get_random_color()};'>{summary_taellinglin}</p>",
            f"<p style='color: {get_random_color()};'>{summary_sannylin}</p>",
            f"<p style='color: {get_random_color()};'>{summary_linglin}</p>",
            f"<p style='color: {get_random_color()};'>{creative_text}</p>")
def main():
    """Build the Gradio UI (no inputs, four HTML output panels) and serve it."""
    interface = gr.Interface(
        fn=scrape_and_process,
        inputs=[],
        outputs=["html", "html", "html", "html"],
    )
    interface.launch(server_name="0.0.0.0")


if __name__ == "__main__":
    main()