Writo committed on
Commit
fb92a02
·
1 Parent(s): 160d5f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -20
app.py CHANGED
@@ -1,28 +1,29 @@
1
  import streamlit as st
2
- from dotenv import load_dotenv
 
 
 
 
 
3
  from PyPDF2 import PdfReader
 
 
4
  from langchain.text_splitter import CharacterTextSplitter
5
  from langchain.embeddings.openai import OpenAIEmbeddings
6
  from langchain.vectorstores import FAISS
7
  from langchain.chains.question_answering import load_qa_chain
8
  from langchain.llms import OpenAI
9
- import time
10
- import logging
11
- import pdfplumber
12
- import os
13
- import requests
14
- from bs4 import BeautifulSoup
15
- import docx # Importing docx for Word document processing
16
 
17
 
18
  def fetch_and_process_pdf(url):
19
- response = requests.get(url)
20
- if response.status_code == 200:
 
21
  pdf_file = io.BytesIO(response.content)
22
  text = process_pdf(pdf_file)
23
  return text
24
- else:
25
- logging.error(f"Failed to fetch PDF from {url}. Status Code: {response.status_code}")
26
  return ""
27
 
28
  def process_pdf(pdf):
@@ -78,7 +79,17 @@ def read_documents_from_directory(directory):
78
  combined_text += read_word(file_path)
79
  return combined_text
80
 
81
-
 
 
 
 
 
 
 
 
 
 
82
  #train_directory = r'C:\Users\writa\Downloads\Crypto'
83
  url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"
84
 
@@ -90,15 +101,18 @@ def main():
90
  # Ensure train_directory is accessible in Hugging Face Space
91
  #text = read_documents_from_directory(train_directory)
92
 
93
- def get_pdf_links_from_dataset(url):
94
- response = requests.get(url)
95
- soup = BeautifulSoup(response.text, 'html.parser')
96
- pdf_links = [link.get('href') for link in soup.find_all('a') if link.get('href').endswith('.pdf')]
97
- return pdf_links
98
-
99
  dataset_url = 'https://huggingface.co/datasets/Writo/realestate_data/tree/main'
100
  pdf_links = get_pdf_links_from_dataset(dataset_url)
101
- print(pdf_links)
 
 
 
 
 
 
 
 
 
102
 
103
  # Processing text and setting up the AI model
104
  char_text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000,
 
1
  import streamlit as st
2
+ import os
3
+ import time
4
+ import logging
5
+ import io
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
  from PyPDF2 import PdfReader
9
+ import pdfplumber
10
+ import docx
11
  from langchain.text_splitter import CharacterTextSplitter
12
  from langchain.embeddings.openai import OpenAIEmbeddings
13
  from langchain.vectorstores import FAISS
14
  from langchain.chains.question_answering import load_qa_chain
15
  from langchain.llms import OpenAI
 
 
 
 
 
 
 
16
 
17
 
18
def fetch_and_process_pdf(url):
    """Download a PDF from *url* and return its extracted text.

    Parameters
    ----------
    url : str
        Direct link to a PDF file.

    Returns
    -------
    str
        Text extracted by ``process_pdf``, or an empty string on any
        download failure, so callers can safely concatenate the result.
    """
    try:
        # timeout keeps the Streamlit app from hanging forever on a dead host
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        pdf_file = io.BytesIO(response.content)
        return process_pdf(pdf_file)
    # RequestException is the base class for HTTPError *and* for
    # ConnectionError/Timeout/InvalidURL raised by requests.get itself;
    # catching only HTTPError (as before) let those crash the app.
    except requests.RequestException as e:
        logging.error(f"Failed to fetch PDF from {url}. Error: {e}")
        return ""
28
 
29
  def process_pdf(pdf):
 
79
  combined_text += read_word(file_path)
80
  return combined_text
81
 
82
def get_pdf_links_from_dataset(url):
    """Scrape *url* and return every anchor href that points to a PDF.

    Parameters
    ----------
    url : str
        Page to scrape (e.g. a Hugging Face dataset tree view).

    Returns
    -------
    list[str]
        hrefs containing ``.pdf``; empty list on any request failure.
    """
    try:
        # timeout keeps the app responsive if the host does not answer
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        pdf_links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            # link.get('href') is None for <a> tags without an href
            # attribute; the previous `'.pdf' in link.get('href')` then
            # raised TypeError ("argument of type 'NoneType' is not
            # iterable"), which the HTTPError handler did not catch.
            if href and '.pdf' in href:
                pdf_links.append(href)
        return pdf_links
    # RequestException covers HTTPError plus ConnectionError/Timeout
    # raised by requests.get itself.
    except requests.RequestException as e:
        logging.error(f"Failed to get PDF links from dataset. Error: {e}")
        return []
92
+
93
  #train_directory = r'C:\Users\writa\Downloads\Crypto'
94
  url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"
95
 
 
101
  # Ensure train_directory is accessible in Hugging Face Space
102
  #text = read_documents_from_directory(train_directory)
103
 
 
 
 
 
 
 
104
  dataset_url = 'https://huggingface.co/datasets/Writo/realestate_data/tree/main'
105
  pdf_links = get_pdf_links_from_dataset(dataset_url)
106
+
107
+ if pdf_links:
108
+ with st.spinner("Processing PDFs, please wait..."):
109
+ text = ""
110
+ for link in pdf_links:
111
+ text += fetch_and_process_pdf(link)
112
+
113
+
114
+
115
+ text = read_documents_from_directory(train_directory)
116
 
117
  # Processing text and setting up the AI model
118
  char_text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000,