Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,28 +1,29 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from PyPDF2 import PdfReader
|
|
|
|
|
|
|
| 4 |
from langchain.text_splitter import CharacterTextSplitter
|
| 5 |
from langchain.embeddings.openai import OpenAIEmbeddings
|
| 6 |
from langchain.vectorstores import FAISS
|
| 7 |
from langchain.chains.question_answering import load_qa_chain
|
| 8 |
from langchain.llms import OpenAI
|
| 9 |
-
import time
|
| 10 |
-
import logging
|
| 11 |
-
import pdfplumber
|
| 12 |
-
import os
|
| 13 |
-
import requests
|
| 14 |
-
from bs4 import BeautifulSoup
|
| 15 |
-
import docx # Importing docx for Word document processing
|
| 16 |
|
| 17 |
|
| 18 |
def fetch_and_process_pdf(url):
|
| 19 |
-
|
| 20 |
-
|
|
|
|
| 21 |
pdf_file = io.BytesIO(response.content)
|
| 22 |
text = process_pdf(pdf_file)
|
| 23 |
return text
|
| 24 |
-
|
| 25 |
-
logging.error(f"Failed to fetch PDF from {url}.
|
| 26 |
return ""
|
| 27 |
|
| 28 |
def process_pdf(pdf):
|
|
@@ -78,7 +79,17 @@ def read_documents_from_directory(directory):
|
|
| 78 |
combined_text += read_word(file_path)
|
| 79 |
return combined_text
|
| 80 |
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
#train_directory = r'C:\Users\writa\Downloads\Crypto'
|
| 83 |
url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"
|
| 84 |
|
|
@@ -90,15 +101,18 @@ def main():
|
|
| 90 |
# Ensure train_directory is accessible in Hugging Face Space
|
| 91 |
#text = read_documents_from_directory(train_directory)
|
| 92 |
|
| 93 |
-
def get_pdf_links_from_dataset(url):
    """Return the href of every anchor on the page at *url* that ends in '.pdf'.

    Args:
        url: Address of the HTML page to scan for links.

    Returns:
        A list of href strings exactly as they appear in the page markup.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # href=True restricts the search to anchors that actually carry an href;
    # without it, link.get('href') is None for bare <a> tags and .endswith
    # raises AttributeError.
    return [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].endswith('.pdf')
    ]
|
| 98 |
-
|
| 99 |
dataset_url = 'https://huggingface.co/datasets/Writo/realestate_data/tree/main'
|
| 100 |
pdf_links = get_pdf_links_from_dataset(dataset_url)
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
# Processing text and setting up the AI model
|
| 104 |
char_text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000,
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
import logging
|
| 5 |
+
import io
|
| 6 |
+
import requests
|
| 7 |
+
from bs4 import BeautifulSoup
|
| 8 |
from PyPDF2 import PdfReader
|
| 9 |
+
import pdfplumber
|
| 10 |
+
import docx
|
| 11 |
from langchain.text_splitter import CharacterTextSplitter
|
| 12 |
from langchain.embeddings.openai import OpenAIEmbeddings
|
| 13 |
from langchain.vectorstores import FAISS
|
| 14 |
from langchain.chains.question_answering import load_qa_chain
|
| 15 |
from langchain.llms import OpenAI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
def fetch_and_process_pdf(url, timeout=30):
    """Download the PDF at *url* and return the text extracted from it.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The result of ``process_pdf`` on the downloaded bytes, or ``""`` when
        the download fails (the failure is logged).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # RequestException is the base class of HTTPError, ConnectionError,
        # Timeout and the invalid-URL errors, so every download failure is
        # logged and mapped to "" instead of only bad HTTP status codes.
        logging.error(f"Failed to fetch PDF from {url}. Error: {e}")
        return ""
    # process_pdf is handed a file-like object, so wrap the raw bytes.
    pdf_file = io.BytesIO(response.content)
    return process_pdf(pdf_file)
|
| 28 |
|
| 29 |
def process_pdf(pdf):
|
|
|
|
| 79 |
combined_text += read_word(file_path)
|
| 80 |
return combined_text
|
| 81 |
|
| 82 |
+
def get_pdf_links_from_dataset(url, timeout=30):
    """Return the PDF links found on the HTML page at *url*.

    Args:
        url: Address of the HTML page to scan for links.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        A list of URLs, one per anchor whose href contains '.pdf'. Relative
        hrefs are resolved against *url*; absolute hrefs are returned
        unchanged. An empty list is returned when the page cannot be fetched
        (the failure is logged).
    """
    from urllib.parse import urljoin

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # RequestException also covers connection errors and timeouts, not
        # just bad HTTP status codes.
        logging.error(f"Failed to get PDF links from dataset. Error: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # href=True skips anchors without an href; for those link.get('href') is
    # None and the "'.pdf' in ..." test raises TypeError.
    # urljoin makes relative hrefs absolute so they can be passed straight to
    # requests.get, which rejects URLs that lack a scheme.
    return [
        urljoin(url, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if '.pdf' in anchor['href']
    ]
|
| 92 |
+
|
| 93 |
# NOTE(review): this local Windows path is commented out, yet main() still calls
# read_documents_from_directory(train_directory). Unless train_directory is
# defined somewhere not shown here, that call raises NameError — confirm.
#train_directory = r'C:\Users\writa\Downloads\Crypto'
|
| 94 |
# Hugging Face dataset page address; main() assigns the same string to
# dataset_url before scanning it for PDF links.
url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"
|
| 95 |
|
|
|
|
| 101 |
# Ensure train_directory is accessible in Hugging Face Space
|
| 102 |
#text = read_documents_from_directory(train_directory)
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
dataset_url = 'https://huggingface.co/datasets/Writo/realestate_data/tree/main'
|
| 105 |
pdf_links = get_pdf_links_from_dataset(dataset_url)
|
| 106 |
+
|
| 107 |
+
if pdf_links:
|
| 108 |
+
with st.spinner("Processing PDFs, please wait..."):
|
| 109 |
+
text = ""
|
| 110 |
+
for link in pdf_links:
|
| 111 |
+
text += fetch_and_process_pdf(link)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
text = read_documents_from_directory(train_directory)
|
| 116 |
|
| 117 |
# Processing text and setting up the AI model
|
| 118 |
char_text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000,
|