Writo committed on
Commit
8d5d86a
·
1 Parent(s): 0894705

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -1
app.py CHANGED
@@ -85,11 +85,17 @@ def get_pdf_links_from_dataset(url):
85
  response = requests.get(url)
86
  response.raise_for_status()
87
  soup = BeautifulSoup(response.text, 'html.parser')
88
- pdf_links = [link.get('href') for link in soup.find_all('a') if '.pdf' in link.get('href')]
 
 
 
 
 
89
  return pdf_links
90
  except requests.HTTPError as e:
91
  logging.error(f"Failed to get PDF links from dataset. Error: {e}")
92
  return []
 
93
 
94
  #train_directory = r'C:\Users\writa\Downloads\Crypto'
95
  url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"
 
85
  response = requests.get(url)
86
  response.raise_for_status()
87
  soup = BeautifulSoup(response.text, 'html.parser')
88
+
89
+ # Define the base URL
90
+ base_url = "https://huggingface.co"
91
+
92
+ # Extract and construct absolute URLs
93
+ pdf_links = [base_url + link.get('href') for link in soup.find_all('a') if '.pdf' in link.get('href')]
94
  return pdf_links
95
  except requests.HTTPError as e:
96
  logging.error(f"Failed to get PDF links from dataset. Error: {e}")
97
  return []
98
+
99
 
100
  #train_directory = r'C:\Users\writa\Downloads\Crypto'
101
  url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"