Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,8 +10,21 @@ import time
|
|
| 10 |
import logging
|
| 11 |
import pdfplumber
|
| 12 |
import os
|
|
|
|
|
|
|
| 13 |
import docx # Importing docx for Word document processing
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def process_pdf(pdf):
|
| 16 |
start_time = time.time()
|
| 17 |
text = ""
|
|
@@ -66,7 +79,8 @@ def read_documents_from_directory(directory):
|
|
| 66 |
return combined_text
|
| 67 |
|
| 68 |
|
| 69 |
-
train_directory = r'C:\Users\writa\Downloads\Crypto'
|
|
|
|
| 70 |
|
| 71 |
def main():
|
| 72 |
load_dotenv()
|
|
@@ -74,7 +88,17 @@ def main():
|
|
| 74 |
st.header("🏢 EstateSphere")
|
| 75 |
|
| 76 |
# Ensure train_directory is accessible in Hugging Face Space
|
| 77 |
-
text = read_documents_from_directory(train_directory)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
# Processing text and setting up the AI model
|
| 80 |
char_text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000,
|
|
|
|
| 10 |
import logging
|
| 11 |
import pdfplumber
|
| 12 |
import os
|
| 13 |
+
import requests
|
| 14 |
+
from bs4 import BeautifulSoup
|
| 15 |
import docx # Importing docx for Word document processing
|
| 16 |
|
| 17 |
+
|
| 18 |
+
def fetch_and_process_pdf(url):
    """Download a PDF from *url* and extract its text.

    Parameters
    ----------
    url : str
        Direct link to a PDF file.

    Returns
    -------
    str
        Text extracted by ``process_pdf``, or ``""`` when the download
        fails (non-200 status or a network error) — failures are logged,
        not raised, so one bad URL does not kill the whole run.
    """
    # Local import: the surrounding diff never adds ``import io`` at module
    # level, so io.BytesIO would raise NameError without this.
    import io

    try:
        # Timeout so a dead host cannot hang the app indefinitely.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        logging.error(f"Failed to fetch PDF from {url}. Error: {exc}")
        return ""
    if response.status_code == 200:
        pdf_file = io.BytesIO(response.content)
        text = process_pdf(pdf_file)
        return text
    else:
        logging.error(f"Failed to fetch PDF from {url}. Status Code: {response.status_code}")
        return ""
|
| 27 |
+
|
| 28 |
def process_pdf(pdf):
|
| 29 |
start_time = time.time()
|
| 30 |
text = ""
|
|
|
|
| 79 |
return combined_text
|
| 80 |
|
| 81 |
|
| 82 |
+
#train_directory = r'C:\Users\writa\Downloads\Crypto'
|
| 83 |
+
dataset_url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"
|
| 84 |
|
| 85 |
def main():
|
| 86 |
load_dotenv()
|
|
|
|
| 88 |
st.header("🏢 EstateSphere")
|
| 89 |
|
| 90 |
# Ensure train_directory is accessible in Hugging Face Space
|
| 91 |
+
#text = read_documents_from_directory(train_directory)
|
| 92 |
+
|
| 93 |
+
def get_pdf_links_from_dataset(url):
    """Scrape *url* and return every anchor href that ends in ``.pdf``.

    Parameters
    ----------
    url : str
        Page to scrape (here, a Hugging Face dataset tree page).

    Returns
    -------
    list[str]
        The href values of all PDF links found on the page.

    Fixes a crash in the original: ``link.get('href')`` is ``None`` for
    <a> tags without an href attribute, so calling ``.endswith`` on it
    raised AttributeError.

    NOTE(review): the HF dataset tree page renders its file list with
    JavaScript, so a plain GET may return no .pdf anchors — verify the
    URL actually serves static links.
    """
    # Timeout so a slow/dead host cannot hang the caller.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Pull hrefs once, then filter out missing (None) ones before endswith.
    pdf_links = [
        href
        for href in (link.get('href') for link in soup.find_all('a'))
        if href and href.endswith('.pdf')
    ]
    return pdf_links
|
| 98 |
+
|
| 99 |
+
dataset_url = 'https://huggingface.co/datasets/Writo/realestate_data/tree/main'
|
| 100 |
+
pdf_links = get_pdf_links_from_dataset(dataset_url)
|
| 101 |
+
print(pdf_links)
|
| 102 |
|
| 103 |
# Processing text and setting up the AI model
|
| 104 |
char_text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000,
|