Spaces:

jasvir-singh1021
/

Easy-data-parser

Sleeping

App Files Files Community

Easy-data-parser / app.py

jasvir-singh1021

Update app.py

714614e verified 7 months ago

raw

history blame contribute delete

3.91 kB

	import streamlit as st
	import openai
	import os
	import json
	from io import StringIO
	from PyPDF2 import PdfReader
	from docx import Document
	import html2text

	# Optional (per the original author's note): prevent config issues on HF
	# Spaces by pointing STREAMLIT_CONFIG_DIR at a directory under /tmp.
	os.environ["STREAMLIT_CONFIG_DIR"] = "/tmp/.streamlit"

	# Page-level configuration: title and wide layout.
	st.set_page_config(page_title="Document Parser", layout="wide")

	# Create the chat-history list only when session state does not hold one yet.
	if "conversation" not in st.session_state:
	    st.session_state["conversation"] = []

	# Sidebar settings: the OpenAI key (masked input) and the sampling temperature.
	st.sidebar.title("Settings")
	api_key = st.sidebar.text_input("OpenAI API Key", type="password")
	temperature = st.sidebar.slider(
	    "Temperature",
	    min_value=0.0,
	    max_value=1.0,
	    value=0.3,
	    step=0.1,
	)

	# Main page heading and short usage hint.
	st.title("Document Parser")
	st.markdown("Upload documents and ask questions using GPT.")

	# Uploader restricted to the four extensions that extract_text() handles.
	uploaded_files = st.file_uploader(
	    label="Upload Documents (PDF, DOCX, TXT, HTML)",
	    type=["pdf", "docx", "txt", "html"],
	    accept_multiple_files=True,
	)

	def extract_text(file):
	    """Return the plain text of an uploaded document, chosen by its extension.

	    Args:
	        file: A binary file-like object exposing ``name`` (used to pick the
	            parser from the text after the last ".") and ``read()``.

	    Returns:
	        The extracted text. PDF pages and DOCX paragraphs are joined with
	        newlines; HTML is converted with ``html2text``. An unrecognised
	        extension yields an empty string.
	    """
	    ext = file.name.lower().split(".")[-1]

	    if ext == "pdf":
	        reader = PdfReader(file)
	        # Extract each page exactly once; pages that yield no text are skipped.
	        page_texts = (page.extract_text() for page in reader.pages)
	        return "\n".join(text for text in page_texts if text)

	    if ext == "docx":
	        doc = Document(file)
	        return "\n".join(para.text for para in doc.paragraphs)

	    if ext == "txt":
	        # errors="replace" keeps a non-UTF-8 upload from raising
	        # UnicodeDecodeError; undecodable bytes become U+FFFD.
	        return file.read().decode("utf-8", errors="replace")

	    if ext == "html":
	        return html2text.html2text(file.read().decode("utf-8", errors="replace"))

	    return ""

	# Input field
	question = st.text_input("Ask a question about the uploaded documents:")

	# Only this many leading characters of the combined document text are sent
	# to the model; anything beyond is dropped from the prompt.
	MAX_DOCUMENT_CHARS = 6000

	# When "Ask" button is clicked
	if st.button("Ask"):
	    if not (uploaded_files and question and api_key):
	        # Tell the user why nothing happened instead of silently ignoring the click.
	        st.warning(
	            "Please upload at least one document, enter a question, "
	            "and provide an OpenAI API key."
	        )
	    else:
	        with st.spinner("Processing..."):
	            # Extract text from every upload; a file that fails to parse is
	            # reported and skipped so the remaining files can still be used.
	            extracted = []
	            for uploaded in uploaded_files:
	                try:
	                    extracted.append(extract_text(uploaded))
	                except Exception as e:
	                    st.warning(f"Could not read {uploaded.name}: {e}")
	            combined_text = "".join(f"{text}\n" for text in extracted)

	            if not combined_text.strip():
	                st.warning("Could not extract text from uploaded files.")
	            else:
	                messages = [
	                    {"role": "system", "content": "You are a helpful assistant that answers questions based on uploaded documents."},
	                    {"role": "user", "content": f"DOCUMENT:\n{combined_text[:MAX_DOCUMENT_CHARS]}\n\nQUESTION:\n{question}"},
	                ]
	                try:
	                    if hasattr(openai, "OpenAI"):
	                        # openai>=1.0 removed openai.ChatCompletion; use the client API.
	                        client = openai.OpenAI(api_key=api_key)
	                        response = client.chat.completions.create(
	                            model="gpt-3.5-turbo",
	                            messages=messages,
	                            temperature=temperature,
	                        )
	                        answer = response.choices[0].message.content
	                    else:
	                        # Legacy openai<1.0 module-level API.
	                        openai.api_key = api_key
	                        response = openai.ChatCompletion.create(
	                            model="gpt-3.5-turbo",
	                            messages=messages,
	                            temperature=temperature,
	                        )
	                        answer = response["choices"][0]["message"]["content"]
	                except Exception as e:
	                    st.error(f"Error from OpenAI: {e}")
	                else:
	                    # Update conversation history only after a successful reply.
	                    st.session_state.conversation.append({"role": "user", "content": question})
	                    st.session_state.conversation.append({"role": "assistant", "content": answer})

	# Display conversation (the whole section is shown only when history exists)
	if st.session_state.conversation:
	    st.markdown("## Conversation")
	    for msg in st.session_state.conversation:
	        st.markdown(f"{'You' if msg['role'] == 'user' else 'Assistant'}: {msg['content']}")

	    st.markdown("---")
	    col1, col2 = st.columns(2)

	    with col1:
	        if st.button("Clear Conversation"):
	            st.session_state.conversation = []
	            # Prefer st.rerun when this Streamlit version provides it;
	            # fall back to the older st.experimental_rerun otherwise.
	            rerun = getattr(st, "rerun", None) or st.experimental_rerun
	            rerun()

	    with col2:
	        # Named download_format so the builtin format() is not shadowed.
	        download_format = st.selectbox("Download Format", ["TXT", "JSON"])
	        if download_format == "TXT":
	            # One "Role:\ncontent" entry per message, separated by blank lines.
	            content = "\n\n".join(
	                f"{msg['role'].capitalize()}:\n{msg['content']}"
	                for msg in st.session_state.conversation
	            )
	            mime = "text/plain"
	            filename = "conversation.txt"
	        else:
	            content = json.dumps(st.session_state.conversation, indent=2)
	            mime = "application/json"
	            filename = "conversation.json"

	        st.download_button("Download", content, file_name=filename, mime=mime)