Spaces:

Writo
/

EstateSphere

Sleeping

Writo committed on Jan 6, 2024

Commit

8d5d86a

1 Parent(s): 0894705

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -85,11 +85,17 @@ def get_pdf_links_from_dataset(url):
         response = requests.get(url)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
-        pdf_links = [link.get('href') for link in soup.find_all('a') if '.pdf' in link.get('href')]
         return pdf_links
     except requests.HTTPError as e:
         logging.error(f"Failed to get PDF links from dataset. Error: {e}")
         return []
 #train_directory = r'C:\Users\writa\Downloads\Crypto'
 url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"

         response = requests.get(url)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
+        # Define the base URL
+        base_url = "https://huggingface.co"
+        # Extract and construct absolute URLs
+        pdf_links = [base_url + link.get('href') for link in soup.find_all('a') if '.pdf' in link.get('href')]
         return pdf_links
     except requests.HTTPError as e:
         logging.error(f"Failed to get PDF links from dataset. Error: {e}")
         return []
 #train_directory = r'C:\Users\writa\Downloads\Crypto'
 url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"