Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -85,11 +85,17 @@ def get_pdf_links_from_dataset(url):
|
|
| 85 |
response = requests.get(url)
|
| 86 |
response.raise_for_status()
|
| 87 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
return pdf_links
|
| 90 |
except requests.HTTPError as e:
|
| 91 |
logging.error(f"Failed to get PDF links from dataset. Error: {e}")
|
| 92 |
return []
|
|
|
|
| 93 |
|
| 94 |
#train_directory = r'C:\Users\writa\Downloads\Crypto'
|
| 95 |
url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"
|
|
|
|
| 85 |
response = requests.get(url)
|
| 86 |
response.raise_for_status()
|
| 87 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 88 |
+
|
| 89 |
+
# Define the base URL
|
| 90 |
+
base_url = "https://huggingface.co"
|
| 91 |
+
|
| 92 |
+
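# Keep only anchors whose href contains '.pdf' and prefix each with base_url to form an absolute URL (NOTE(review): an <a> tag without href yields None here, so the `in` test would raise TypeError — confirm and guard)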
|
| 93 |
+
pdf_links = [base_url + link.get('href') for link in soup.find_all('a') if '.pdf' in link.get('href')]
|
| 94 |
return pdf_links
|
| 95 |
except requests.HTTPError as e:
|
| 96 |
logging.error(f"Failed to get PDF links from dataset. Error: {e}")
|
| 97 |
return []
|
| 98 |
+
|
| 99 |
|
| 100 |
#train_directory = r'C:\Users\writa\Downloads\Crypto'
|
| 101 |
url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"
|