import requests
from bs4 import BeautifulSoup
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import gradio as gr
from tqdm import tqdm
from termcolor import colored
import random
import re
import concurrent.futures
from urllib.parse import urljoin
def get_random_color():
    colors = ["red", "green", "yellow", "blue", "magenta", "cyan", "white"]
    return random.choice(colors)


def print_colored(text):
    color = get_random_color()
    print(colored(text, color))
def scrape_website(url):
    try:
        # A timeout keeps one slow host from stalling the whole scrape.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        text = " ".join(p.get_text() for p in soup.find_all(["p", "div", "span"]))
        return text, soup
    except requests.RequestException as e:
        print_colored(f"Error scraping {url}: {e}")
        return "", None
def clean_text(text):
    return " ".join(text.split())
def extract_links(soup, base_url):
    links = []
    if soup:
        for a in soup.find_all('a', href=True):
            # urljoin correctly resolves relative hrefs ("/about", "about.html")
            # against the page URL.
            link = urljoin(base_url, a['href'])
            if base_url in link and link not in links:
                links.append(link)
    return links
def process_data(text):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    # BART-Large-CNN accepts at most 1024 *tokens*; slicing to 1024 characters
    # is a rough, conservative cut, and truncation=True catches any overflow.
    max_input_chars = 1024
    inputs = text[:max_input_chars]
    summary = summarizer(inputs, max_length=300, min_length=200, do_sample=True, truncation=True)
    return summary[0]['summary_text']
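
# Sketch of a token-aware alternative to the character cut in process_data:
# split the text into chunks that fit BART's 1024-token window, summarize each
# chunk, and join the results. The chunk size and length bounds are
# illustrative assumptions, not tuned values.
def summarize_long_text(text, summarizer=None, max_tokens=1024):
    if summarizer is None:
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    tokenizer = summarizer.tokenizer
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    summaries = []
    step = max_tokens - 2  # leave room for the special tokens BART adds
    for i in range(0, len(token_ids), step):
        chunk = tokenizer.decode(token_ids[i:i + step])
        # truncation=True is a backstop: decode/re-encode may not round-trip
        # to exactly the same token count.
        result = summarizer(chunk, max_length=150, min_length=30,
                            do_sample=False, truncation=True)
        summaries.append(result[0]['summary_text'])
    return " ".join(summaries)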
def generate_creative_text(summary1, summary2, summary3):
    tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
    model = AutoModelForCausalLM.from_pretrained("gpt2-xl")
    input_text = f"Create a poem or short story about Ling and Sanny Lin, combining the following three summaries:\n\nSummary 1: {summary1}\n\nSummary 2: {summary2}\n\nSummary 3: {summary3}\n\nPoem/Short Story:"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)
    outputs = model.generate(
        inputs["input_ids"],
        # max_new_tokens bounds only the generated text; max_length counts the
        # prompt too and can leave no room to generate from a long prompt.
        max_new_tokens=300,
        num_return_sequences=1,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=inputs["attention_mask"]
    )
    creative_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    creative_text = re.sub(r'\s+', ' ', creative_text).strip()
    return creative_text
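
# The generate() call above decodes mostly greedily; for more varied
# "creative" output, sampling is a common alternative. do_sample, top_p, and
# temperature are standard transformers generation arguments, but the values
# below are illustrative assumptions, not tuned:
#
#     outputs = model.generate(
#         inputs["input_ids"],
#         attention_mask=inputs["attention_mask"],
#         max_new_tokens=300,
#         do_sample=True,
#         top_p=0.9,
#         temperature=0.8,
#         repetition_penalty=1.2,
#         pad_token_id=tokenizer.eos_token_id,
#     )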
def scrape_and_process_site(base_url, depth=1, max_depth=2, visited=None):
    # Track visited URLs across the recursion so no page is fetched twice.
    if visited is None:
        visited = set()
    if base_url in visited:
        return []
    visited.add(base_url)
    print_colored(f"Scraping data from {base_url} (depth: {depth})...")
    main_text, soup = scrape_website(base_url)
    main_text = clean_text(main_text)
    all_text = [main_text] if main_text else []
    if depth < max_depth:
        links = extract_links(soup, base_url)
        for link in tqdm(links, desc=f"Scraping links from {base_url}"):
            # Each recursive call fetches its own page, so every link is
            # scraped exactly once.
            all_text.extend(scrape_and_process_site(link, depth + 1, max_depth, visited))
    return all_text
def scrape_and_process():
    base_urls = ["https://taellinglin.art", "https://sannylin.me", "https://linglin.fun"]
    results = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {executor.submit(scrape_and_process_site, url): url for url in base_urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                results[url] = " ".join(data)
            except Exception as exc:
                print_colored(f"{url} generated an exception: {exc}")

    def summarize_site(url):
        # Run the BART summarizer over the scraped text; otherwise the raw
        # page text would reach the generator despite the "summary" names.
        raw = results.get(url, "")
        return process_data(raw) if raw else "No data found."

    summary_taellinglin = summarize_site("https://taellinglin.art")
    summary_sannylin = summarize_site("https://sannylin.me")
    summary_linglin = summarize_site("https://linglin.fun")
    creative_text = generate_creative_text(summary_taellinglin, summary_sannylin, summary_linglin)
    return (f"<p style='color: {get_random_color()};'>{summary_taellinglin}</p>",
            f"<p style='color: {get_random_color()};'>{summary_sannylin}</p>",
            f"<p style='color: {get_random_color()};'>{summary_linglin}</p>",
            f"<p style='color: {get_random_color()};'>{creative_text}</p>")
def main():
    iface = gr.Interface(fn=scrape_and_process,
                         inputs=[],
                         outputs=["html", "html", "html", "html"])
    iface.launch(server_name="0.0.0.0")


if __name__ == "__main__":
    main()
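
# To run locally, assuming this file is saved as app.py (the Hugging Face
# Spaces convention): `python app.py`, then open the URL Gradio prints
# (http://0.0.0.0:7860 by default). The first request downloads the BART and
# GPT-2 XL checkpoints, so expect a long warm-up; the GPT-2 XL weights alone
# are several gigabytes.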