Spaces:

leetuan023
/

pack

Sleeping

App Files Files Community

leetuan023 committed on Aug 17, 2024

Commit

6b8a953

verified ·

1 Parent(s): 8d892b3

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -2

app.py CHANGED Viewed

@@ -1,3 +1,58 @@
-from diffusers import DiffusionPipeline
-pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt")

+import requests
+from bs4 import BeautifulSoup
+from transformers import AutoModelForTokenClassification, AutoTokenizer
+# Load the checkpoint named below with a token-classification head, plus its matching tokenizer; both loads run at module level. NOTE(review): this looks like a base (not fine-tuned) checkpoint, so the head is presumably untrained - confirm.
+model_name = "distilbert-base-uncased"
+model = AutoModelForTokenClassification.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+def scrape_website(url, timeout=10):
+    """Fetch *url* and return its visible text, clipped to 512 tokens.
+
+    Args:
+        url: Address of the page to download.
+        timeout: Seconds to wait for the server before giving up.
+
+    Returns:
+        The page text after a round trip through the module-level
+        ``tokenizer``, i.e. truncated to at most 512 tokens and normalised
+        the way that tokenizer normalises text (the "uncased" checkpoint
+        presumably lowercases it - confirm).
+
+    Raises:
+        requests.RequestException: On connection failure, timeout, or a
+            4xx/5xx response.
+    """
+    # A timeout keeps a stalled server from blocking the caller forever.
+    response = requests.get(url, timeout=timeout)
+    # Fail loudly on error responses instead of scraping the error page.
+    response.raise_for_status()
+    soup = BeautifulSoup(response.content, 'html.parser')
+    # Separate text nodes with a space so adjacent elements do not run together.
+    text = soup.get_text(separator=' ', strip=True)
+    # Ask for truncation explicitly rather than relying on max_length alone.
+    encoded = tokenizer(
+        text,
+        add_special_tokens=True,
+        max_length=512,
+        truncation=True,
+    )
+    # Decode the truncated token ids. The previous version tried to decode a
+    # slice of the model's floating-point output, which is not a sequence of
+    # token ids and therefore cannot be turned back into text.
+    content_str = tokenizer.decode(encoded['input_ids'], skip_special_tokens=True)
+    return content_str
+# Scrape every URL in a collection and merge the results
+def scrape_multiple_websites(urls):
+    """Return the scraped text of each URL in *urls*, separated by blank lines."""
+    # One scrape_website call per URL, in the order given.
+    pages = [scrape_website(address) for address in urls]
+    return '\n\n'.join(pages)
+# Example usage: scrape a single URL. This runs at module level, so it issues a live HTTP request whenever the file is executed or imported.
+url = "https://www.example.com"
+content = scrape_website(url)
+print(content)
+# Example usage: scrape several URLs in one call; `content` is rebound to the combined result before printing.
+urls = ["https://www.example.com", "https://www.example2.com"]
+content = scrape_multiple_websites(urls)
+print(content)