Writo committed on
Commit
4da0d83
·
1 Parent(s): d680019

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -2
app.py CHANGED
@@ -10,8 +10,21 @@ import time
10
  import logging
11
  import pdfplumber
12
  import os
 
 
13
  import docx # Importing docx for Word document processing
14
 
 
 
 
 
 
 
 
 
 
 
 
15
  def process_pdf(pdf):
16
  start_time = time.time()
17
  text = ""
@@ -66,7 +79,8 @@ def read_documents_from_directory(directory):
66
  return combined_text
67
 
68
 
69
- train_directory = r'C:\Users\writa\Downloads\Crypto'
 
70
 
71
  def main():
72
  load_dotenv()
@@ -74,7 +88,17 @@ def main():
74
  st.header("🏢 EstateSphere")
75
 
76
  # Ensure train_directory is accessible in Hugging Face Space
77
- text = read_documents_from_directory(train_directory)
 
 
 
 
 
 
 
 
 
 
78
 
79
  # Processing text and setting up the AI model
80
  char_text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000,
 
10
  import logging
11
  import pdfplumber
12
  import os
13
+ import requests
14
+ from bs4 import BeautifulSoup
15
  import docx # Importing docx for Word document processing
16
 
17
def fetch_and_process_pdf(url):
    """Download a PDF from *url* and extract its text via process_pdf.

    Returns the extracted text, or an empty string if the download
    fails (non-200 status or a network error), logging the failure.
    """
    # Local import: `io` is needed for BytesIO but is not among the
    # file's visible top-level imports.
    import io
    try:
        # timeout prevents the request from hanging indefinitely on a
        # stalled connection.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        logging.error(f"Failed to fetch PDF from {url}. Error: {exc}")
        return ""
    if response.status_code == 200:
        # Wrap the raw bytes in a file-like object so process_pdf can
        # treat the download like an opened PDF file.
        pdf_file = io.BytesIO(response.content)
        text = process_pdf(pdf_file)
        return text
    else:
        logging.error(f"Failed to fetch PDF from {url}. Status Code: {response.status_code}")
        return ""
+
28
  def process_pdf(pdf):
29
  start_time = time.time()
30
  text = ""
 
79
  return combined_text
80
 
81
 
82
+ #train_directory = r'C:\Users\writa\Downloads\Crypto'
83
+ dataset_url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"
84
 
85
  def main():
86
  load_dotenv()
 
88
  st.header("🏢 EstateSphere")
89
 
90
  # Ensure train_directory is accessible in Hugging Face Space
91
+ #text = read_documents_from_directory(train_directory)
92
+
93
def get_pdf_links_from_dataset(url):
    """Scrape *url* and return every anchor href ending in '.pdf'.

    Returns a list of href strings (possibly empty).
    """
    # timeout prevents the request from hanging indefinitely.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    # link.get('href') is None for <a> tags without an href attribute;
    # the original called .endswith on it directly, which raises
    # AttributeError. Guard with a walrus binding before the check.
    pdf_links = [
        href
        for link in soup.find_all('a')
        if (href := link.get('href')) and href.endswith('.pdf')
    ]
    return pdf_links
98
+
99
+ dataset_url = 'https://huggingface.co/datasets/Writo/realestate_data/tree/main'
100
+ pdf_links = get_pdf_links_from_dataset(dataset_url)
101
+ print(pdf_links)
102
 
103
  # Processing text and setting up the AI model
104
  char_text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000,