# NOTE(review): the three lines below code were banner text ("Spaces:" /
# "Build error") pasted from the Hugging Face Spaces UI, not Python source;
# kept here as a comment so the module parses.
# Standard library
import tempfile
from io import BytesIO

# Third-party
import docx
import gradio as gr
import PyPDF2
from docx import Document
from pptx import Presentation
from transformers import pipeline
| # Initialize Hugging Face models for summarization, rephrasing, and sentiment analysis | |
| summarizer = pipeline("summarization", model="facebook/bart-large-cnn") | |
| rephraser = pipeline("text2text-generation", model="Vamsi/T5_Paraphrase_Paws", max_length=512, truncation=True) | |
| sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english") | |
| # Function to read content from different file types | |
| def read_file(file, file_type): | |
| content = "" | |
| try: | |
| if file_type == "docx": | |
| doc = Document(file) | |
| for para in doc.paragraphs: | |
| content += para.text + "\n" | |
| elif file_type == "txt": | |
| content = file.read().decode("utf-8") | |
| elif file_type == "pdf": | |
| pdf_reader = PyPDF2.PdfReader(file) | |
| for page in pdf_reader.pages: | |
| content += page.extract_text() + "\n" | |
| elif file_type == "pptx": | |
| prs = Presentation(file) | |
| for slide in prs.slides: | |
| for shape in slide.shapes: | |
| if hasattr(shape, "text"): | |
| content += shape.text + "\n" | |
| except Exception as e: | |
| content = f"Error reading the file: {str(e)}" | |
| return content | |
| # Function to process the file and generate outputs | |
| def process_file(file, file_type, language="en"): | |
| content = read_file(file, file_type) | |
| # Check if content is not empty | |
| if not content.strip() or "Error" in content: | |
| return "Error: The document is empty or unsupported format.", None, None, None, None, None | |
| # Summarize the content | |
| try: | |
| summary = summarizer(content, max_length=150, min_length=50, do_sample=False) | |
| summary_text = summary[0]['summary_text'] | |
| except Exception as e: | |
| summary_text = f"Summary Error: {str(e)}" | |
| # Rephrase the entire content in manageable chunks | |
| rephrased_text = "" | |
| try: | |
| chunk_size = 500 | |
| content_chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)] | |
| for chunk in content_chunks: | |
| rephrased = rephraser(chunk) | |
| rephrased_text += rephrased[0]['generated_text'] + " " | |
| except Exception as e: | |
| rephrased_text = f"Rephrase Error: {str(e)}" | |
| # Sentiment analysis | |
| try: | |
| sentiment = sentiment_analyzer(content[:512]) | |
| sentiment_text = sentiment[0]['label'] | |
| except Exception as e: | |
| sentiment_text = f"Sentiment Analysis Error: {str(e)}" | |
| # Extract keywords (for simplicity, extracting words here, but you can replace this with a better method) | |
| keywords = ' '.join([word for word in content.split()[:10]]) | |
| # Saving processed file (for download link) | |
| try: | |
| with tempfile.NamedTemporaryFile(delete=False, suffix='.txt') as temp_file: | |
| temp_file.write(content.encode('utf-8')) | |
| processed_file_path = temp_file.name | |
| except Exception as e: | |
| processed_file_path = f"Error saving processed document: {str(e)}" | |
| return content, rephrased_text.strip(), summary_text, sentiment_text, keywords, processed_file_path | |
| # Define the functions for the different pages | |
| def home_page(): | |
| with gr.Blocks() as home: | |
| # Header | |
| gr.Markdown("## Upload a Document to Process") | |
| # Menu bar as buttons | |
| with gr.Row(): | |
| home_btn = gr.Button("Home") | |
| full_analysis_btn = gr.Button("Full Analysis", variant="primary") | |
| # Display content on home page | |
| gr.Markdown("Welcome to the Document Processor!") | |
| gr.Markdown("Upload your document here and click to view details on the 'Full Analysis' page.") | |
| # File upload and content output | |
| file_input = gr.File(label="Upload Document") | |
| content_output = gr.Textbox(label="Original Content") | |
| rephrased_output = gr.Textbox(label="Rephrased Content") | |
| def on_file_upload(file): | |
| if not file: | |
| return "No file uploaded.", None | |
| content, rephrased, _, _, _, _ = process_file(file, file_type="docx") | |
| return content, rephrased | |
| # Process file on upload | |
| file_input.change(on_file_upload, inputs=file_input, outputs=[content_output, rephrased_output]) | |
| return home | |
| def detailed_page(): | |
| with gr.Blocks() as detailed: | |
| # Header | |
| gr.Markdown("## Detailed Analysis Page") | |
| # Menu bar as buttons | |
| with gr.Row(): | |
| home_btn = gr.Button("Home", variant="primary") | |
| full_analysis_btn = gr.Button("Full Analysis") | |
| # File upload and processing components | |
| file_input = gr.File(label="Upload Document") | |
| file_type = gr.Dropdown(["pdf", "docx", "txt", "pptx"], label="File Type") | |
| keywords_output = gr.Textbox(label="Keywords") | |
| sentiment_output = gr.Textbox(label="Sentiment Analysis") | |
| download_link = gr.File(label="Download Processed Document") | |
| def on_file_upload(file, file_type): | |
| if not file: | |
| return "No file uploaded.", None, None, None | |
| _, _, _, sentiment, keywords, download_path = process_file(file, file_type) | |
| return keywords, sentiment, download_path | |
| # Process file on upload | |
| file_input.change(on_file_upload, inputs=[file_input, file_type], outputs=[keywords_output, sentiment_output, download_link]) | |
| # Sample output or content for the detailed analysis page | |
| gr.Markdown("Here you will see detailed analysis outputs after document upload.") | |
| return detailed | |
| # Main application interface with tabbed navigation | |
| iface = gr.TabbedInterface([home_page(), detailed_page()], ["Home", "Full Analysis"]) | |
| iface.launch() | |