# EstateSphere — Streamlit QA app over a Hugging Face real-estate PDF dataset.
import io
import logging
import os
import time
from urllib.parse import urljoin

import docx
import pdfplumber
import requests
import streamlit as st
from bs4 import BeautifulSoup
#from PyPDF2 import PdfReader
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
def fetch_and_process_pdf(url):
    """Download the PDF at *url* and return its extracted text.

    Returns "" on any network failure so callers can safely concatenate
    results without their own error handling.
    """
    try:
        # timeout prevents an unreachable host from hanging the app forever.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as e:
        # RequestException covers HTTPError (the only case the original
        # caught) plus ConnectionError/Timeout, which previously propagated
        # and crashed main().
        logging.error(f"Failed to fetch PDF from {url}. Error: {e}")
        return ""
    pdf_file = io.BytesIO(response.content)
    return process_pdf(pdf_file)
def process_pdf(pdf):
    """Extract and return all text from an open PDF file object,
    logging how long the extraction took."""
    started = time.time()
    with pdfplumber.open(pdf) as reader:
        # extract_text() may return None for image-only pages; treat as "".
        extracted = "".join(page.extract_text() or "" for page in reader.pages)
    elapsed = time.time() - started
    logging.info(f"Processed PDF in {elapsed} seconds")
    return extracted
def display_chat_history():
    """Render the accumulated Q&A history in a read-only text area."""
    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []
    entries = [
        f"Q: {entry['question']}\nA: {entry['answer']}\n{entry['time']}\n---\n"
        for entry in st.session_state.chat_history
    ]
    st.text_area("Chat History", "".join(entries), height=300)
def update_chat_history(question, answer):
    """Append one question/answer pair, stamped with the current local
    time, to the session-scoped chat history."""
    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []
    record = {
        "question": question,
        "answer": answer,
        "time": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    st.session_state.chat_history.append(record)
def read_pdf(file_path):
    """Return all extractable text from the PDF at *file_path*.

    Rewritten to use pdfplumber (already imported at the top of the file):
    the original called PdfReader, whose import is commented out, so it
    raised NameError on every call. Pages with no extractable text
    (e.g. scanned images) contribute "" instead of raising TypeError.
    """
    with pdfplumber.open(file_path) as reader:
        return "".join(page.extract_text() or "" for page in reader.pages)
def read_word(file_path):
    """Return the text of every paragraph in a .docx file, one per line."""
    document = docx.Document(file_path)
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
def read_documents_from_directory(directory):
    """Concatenate the text of every .pdf and .docx file directly inside
    *directory* (other file types are ignored)."""
    parts = []
    for entry in os.listdir(directory):
        full_path = os.path.join(directory, entry)
        if entry.endswith(".pdf"):
            parts.append(read_pdf(full_path))
        elif entry.endswith(".docx"):
            parts.append(read_word(full_path))
    return "".join(parts)
def get_pdf_links_from_dataset(url):
    """Scrape *url* and return absolute URLs for every linked .pdf file.

    Returns [] on any network failure. Anchors without an href attribute
    are skipped — `link.get('href')` is None for them, and the original
    `'.pdf' in None` membership test raised TypeError. `urljoin` handles
    both site-relative and already-absolute hrefs, which naive string
    concatenation did not.
    """
    try:
        # timeout prevents an unreachable host from hanging the app.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as e:
        # Broadened from HTTPError: connection errors and timeouts
        # previously propagated and crashed the caller.
        logging.error(f"Failed to get PDF links from dataset. Error: {e}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    base_url = "https://huggingface.co"
    pdf_links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and '.pdf' in href:
            pdf_links.append(urljoin(base_url, href))
    return pdf_links
#train_directory = r'C:\Users\writa\Downloads\Crypto'
# NOTE(review): this module-level `url` is never read anywhere in the file —
# main() defines its own `dataset_url` with the same value. Presumably a
# leftover; confirm before removing.
url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"
def main():
    """Streamlit entry point: fetch PDFs from the Hugging Face dataset,
    index their text with FAISS, and answer user questions over it."""
    load_dotenv()  # loads OPENAI_API_KEY etc. from a local .env file
    st.set_page_config(page_title="EstateSphere")
    st.header("🏢 EstateSphere")
    dataset_url = 'https://huggingface.co/datasets/Writo/realestate_data/tree/main'
    pdf_links = get_pdf_links_from_dataset(dataset_url)
    if pdf_links:
        with st.spinner("Processing PDFs, please wait..."):
            text = "".join(fetch_and_process_pdf(link) for link in pdf_links)
        # Split the corpus into overlapping chunks and index them for retrieval.
        char_text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000,
                                                   chunk_overlap=200, length_function=len)
        text_chunks = char_text_splitter.split_text(text)
        embeddings = OpenAIEmbeddings()
        docsearch = FAISS.from_texts(text_chunks, embeddings)
        llm = OpenAI()
        # "stuff" is a valid built-in QA chain type; the original placeholder
        # "appropriate_type" made load_qa_chain raise ValueError at startup.
        chain = load_qa_chain(llm, chain_type="stuff")
        # Chat interface
        query = st.text_input("Type your question:", key="query")
        if query:
            with st.spinner("Finding your answer..."):
                try:
                    docs = docsearch.similarity_search(query)
                    response = chain.run(input_documents=docs, question=query)
                    update_chat_history(query, response)
                    display_chat_history()
                except Exception as e:
                    st.error(f"An error occurred: {e}")
    # Help and support in the sidebar
    st.sidebar.header("Help & Support")
    st.sidebar.write("Need assistance? Reach out to our support team.")
    # Footer
    st.sidebar.text("© 2024 MosaicAI")
    # Removed the stray trailing `update_chat_history(question, answer)` —
    # neither name was defined in this scope, so it raised NameError on
    # every run of main().


if __name__ == "__main__":
    main()