Create app.py
app.py
ADDED
@@ -0,0 +1,124 @@
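# Overview: scrape three personal sites, summarize each site's text with
# BART-Large-CNN, ask GPT-2 XL for a poem/short story combining the three
# summaries, and serve the four results as colored HTML via Gradio on port 8686.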
import requests
from bs4 import BeautifulSoup
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import gradio as gr
from tqdm import tqdm
from termcolor import colored
import random
import re
import concurrent.futures
from urllib.parse import urljoin  # resolves relative links in extract_links

def get_random_color():
    colors = ["red", "green", "yellow", "blue", "magenta", "cyan", "white"]
    return random.choice(colors)

def print_colored(text):
    color = get_random_color()
    print(colored(text, color))

def scrape_website(url):
    try:
        # timeout added so a stalled request cannot hang a worker thread
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        text = " ".join(p.get_text() for p in soup.find_all(["p", "div", "span"]))
        return text, soup
    except requests.RequestException as e:
        print_colored(f"Error scraping {url}: {e}")
        return "", None

def clean_text(text):
    cleaned_text = " ".join(text.split())
    return cleaned_text

def extract_links(soup, base_url):
    links = []
    if soup:
        for a in soup.find_all('a', href=True):
            # urljoin resolves relative hrefs ("/about", "page.html") against
            # the base URL; plain string concatenation mishandled them.
            link = urljoin(base_url, a['href'])
            if base_url in link and link not in links:
                links.append(link)
    return links

def process_data(text):
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    # Truncate by characters as a cheap guard; 1024 characters stays comfortably
    # below BART-Large-CNN's 1024-token input limit.
    max_input_length = 1024
    inputs = text[:max_input_length]
    summary = summarizer(inputs, max_length=300, min_length=200, do_sample=True)
    return summary[0]['summary_text']

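# Note: the summarization pipeline above is rebuilt on every call, which
# re-loads BART each time. A common optimization (an assumption, not in the
# original code) is to construct it once at module scope and reuse it:
#
#   SUMMARIZER = pipeline("summarization", model="facebook/bart-large-cnn")
#
#   def process_data(text):
#       summary = SUMMARIZER(text[:1024], max_length=300, min_length=200, do_sample=True)
#       return summary[0]['summary_text']
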
def generate_creative_text(summary1, summary2, summary3):
    tokenizer = AutoTokenizer.from_pretrained("gpt2-xl")
    model = AutoModelForCausalLM.from_pretrained("gpt2-xl")

    input_text = f"Create a poem or short story about Ling and Sanny Lin, combining the following three summaries:\n\nSummary 1: {summary1}\n\nSummary 2: {summary2}\n\nSummary 3: {summary3}\n\nPoem/Short Story:"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)
    outputs = model.generate(
        inputs["input_ids"],
        # max_new_tokens bounds only the generated continuation; the original
        # max_length=300 counted the prompt too, so any prompt longer than
        # 300 tokens would have left no room to generate.
        max_new_tokens=300,
        num_return_sequences=1,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=inputs["attention_mask"]
    )

    creative_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    creative_text = re.sub(r'\s+', ' ', creative_text).strip()

    return creative_text

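# Note: generate() above decodes greedily by default, which tends toward
# repetitive output for creative writing. Enabling sampling (an alternative,
# not in the original code) is a one-line change:
#
#   outputs = model.generate(..., do_sample=True, top_p=0.9, temperature=0.8)
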
def scrape_and_process_site(base_url, depth=1, max_depth=2):
    print_colored(f"Scraping data from {base_url} (depth: {depth})...")
    main_text, soup = scrape_website(base_url)
    main_text = clean_text(main_text)
    all_text = [main_text]

    if depth < max_depth:
        links = extract_links(soup, base_url)

        for link in tqdm(links, desc=f"Scraping links from {base_url}"):
            link_text, _ = scrape_website(link)
            link_text = clean_text(link_text)
            if link_text:
                all_text.append(link_text)
            # Recursive call to scrape links deeper
            sub_links_text = scrape_and_process_site(link, depth + 1, max_depth)
            all_text.extend(sub_links_text)

    return all_text

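# Caveat: the recursion above does not remember visited URLs, so pages that
# link to each other are fetched repeatedly within the depth limit. A visited
# set (a sketch, not in the original code) would avoid the duplicate work:
#
#   def scrape_and_process_site(base_url, depth=1, max_depth=2, seen=None):
#       seen = set() if seen is None else seen
#       if base_url in seen:
#           return []
#       seen.add(base_url)
#       ...and pass seen through the recursive call...
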
def scrape_and_process():
    base_urls = ["https://taellinglin.art", "https://sannylin.me", "https://linglin.fun"]
    results = {}

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_url = {executor.submit(scrape_and_process_site, url): url for url in base_urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                results[url] = " ".join(data)
            except Exception as exc:
                print_colored(f"{url} generated an exception: {exc}")

    # Summarize each site's scraped text with process_data; the original code
    # passed the raw concatenated text straight through, leaving the
    # summarization function unused.
    def summarize(url):
        text = results.get(url, "")
        return process_data(text) if text else "No data found."

    summary_taellinglin = summarize("https://taellinglin.art")
    summary_sannylin = summarize("https://sannylin.me")
    summary_linglin = summarize("https://linglin.fun")

    creative_text = generate_creative_text(summary_taellinglin, summary_sannylin, summary_linglin)

    return (f"<p style='color: {get_random_color()};'>{summary_taellinglin}</p>",
            f"<p style='color: {get_random_color()};'>{summary_sannylin}</p>",
            f"<p style='color: {get_random_color()};'>{summary_linglin}</p>",
            f"<p style='color: {get_random_color()};'>{creative_text}</p>")

def main():
    iface = gr.Interface(fn=scrape_and_process,
                         inputs=[],
                         outputs=["html", "html", "html", "html"])
    iface.launch(server_name="0.0.0.0", server_port=8686)

if __name__ == "__main__":
    main()
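
# Running locally (a sketch; package names are inferred from the imports
# above, and torch is assumed as the transformers backend):
#
#   pip install requests beautifulsoup4 transformers torch gradio tqdm termcolor
#   python app.py
#
# The interface is then served on http://0.0.0.0:8686.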