taellinglin committed
Commit 3cb8b19 · verified · 1 Parent(s): 4f502d1

Create app.py
Files changed (1): app.py (+124, -0)

app.py ADDED
import requests
from bs4 import BeautifulSoup
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import gradio as gr
from tqdm import tqdm
from termcolor import colored
import random
import re
import concurrent.futures
from urllib.parse import urljoin

def get_random_color():
    colors = ["red", "green", "yellow", "blue", "magenta", "cyan", "white"]
    return random.choice(colors)

def print_colored(text):
    color = get_random_color()
    print(colored(text, color))

def scrape_website(url):
    try:
        # A timeout keeps one unresponsive site from stalling the whole scrape.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        text = " ".join(p.get_text() for p in soup.find_all(["p", "div", "span"]))
        return text, soup
    except requests.RequestException as e:
        print_colored(f"Error scraping {url}: {e}")
        return "", None

def clean_text(text):
    cleaned_text = " ".join(text.split())
    return cleaned_text

def extract_links(soup, base_url):
    links = []
    if soup:
        for a in soup.find_all('a', href=True):
            # urljoin resolves relative hrefs correctly; plain string
            # concatenation breaks on paths like "about" or "../page".
            link = urljoin(base_url, a['href'])
            if base_url in link and link not in links:
                links.append(link)
    return links

def process_data(text):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    # BART-Large-CNN accepts at most 1024 tokens; slicing 1024 characters is a
    # crude truncation, but safely below the token limit.
    max_input_length = 1024
    inputs = text[:max_input_length]
    summary = summarizer(inputs, max_length=300, min_length=200, do_sample=True)
    return summary[0]['summary_text']

def generate_creative_text(summary1, summary2, summary3):
    tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
    model = AutoModelForCausalLM.from_pretrained("gpt2-xl")

    input_text = f"Create a poem or short story about Ling and Sanny Lin, combining the following three summaries:\n\nSummary 1: {summary1}\n\nSummary 2: {summary2}\n\nSummary 3: {summary3}\n\nPoem/Short Story:"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)
    outputs = model.generate(
        inputs["input_ids"],
        # max_new_tokens rather than max_length: the prompt alone can exceed
        # 300 tokens, which would make generate() fail with max_length=300.
        max_new_tokens=300,
        num_return_sequences=1,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=inputs["attention_mask"]
    )

    # Decode only the newly generated tokens, not the echoed prompt.
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    creative_text = tokenizer.decode(generated, skip_special_tokens=True)
    creative_text = re.sub(r'\s+', ' ', creative_text).strip()

    return creative_text

def scrape_and_process_site(base_url, depth=1, max_depth=2):
    print_colored(f"Scraping data from {base_url} (depth: {depth})...")
    main_text, soup = scrape_website(base_url)
    main_text = clean_text(main_text)
    all_text = [main_text]

    if depth < max_depth:
        links = extract_links(soup, base_url)

        for link in tqdm(links, desc=f"Scraping links from {base_url}"):
            # The recursive call scrapes the link itself, so scraping it again
            # here would collect every linked page twice.
            sub_links_text = scrape_and_process_site(link, depth + 1, max_depth)
            all_text.extend(sub_links_text)

    return all_text

def scrape_and_process():
    base_urls = ["https://taellinglin.art", "https://sannylin.me", "https://linglin.fun"]
    results = {}

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {executor.submit(scrape_and_process_site, url): url for url in base_urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                results[url] = " ".join(data)
            except Exception as exc:
                print_colored(f"{url} generated an exception: {exc}")

    def summarize(url):
        # Run the scraped text through process_data (otherwise it is never
        # called) so these variables hold BART summaries, not raw page text.
        raw = results.get(url, "")
        return process_data(raw) if raw.strip() else "No data found."

    summary_taellinglin = summarize("https://taellinglin.art")
    summary_sannylin = summarize("https://sannylin.me")
    summary_linglin = summarize("https://linglin.fun")

    creative_text = generate_creative_text(summary_taellinglin, summary_sannylin, summary_linglin)

    return (f"<p style='color: {get_random_color()};'>{summary_taellinglin}</p>",
            f"<p style='color: {get_random_color()};'>{summary_sannylin}</p>",
            f"<p style='color: {get_random_color()};'>{summary_linglin}</p>",
            f"<p style='color: {get_random_color()};'>{creative_text}</p>")

def main():
    iface = gr.Interface(fn=scrape_and_process,
                         inputs=[],
                         outputs=["html", "html", "html", "html"])
    iface.launch(server_name="0.0.0.0", server_port=8686)

if __name__ == "__main__":
    main()