Spaces:

NavyDevilDoc
/

Writing_Assistant

Sleeping

App Files Files Community

Writing_Assistant / file_processing.py

NavyDevilDoc

Update file_processing.py

ff15ee0 verified 3 months ago

raw

history blame contribute delete

1.94 kB

	# file_processing.py
	import PyPDF2
	import docx
	import pandas as pd
	from io import BytesIO
	import streamlit as st

	# Largest upload, in megabytes, that validate_and_extract will attempt to parse.
	MAX_FILE_SIZE_MB = 10


	def validate_and_extract(uploaded_file):
	    """Check the upload's size, then extract its text.

	    Args:
	        uploaded_file: File-like upload exposing ``size``, ``name`` and
	            ``read()`` (presumably a Streamlit ``UploadedFile`` - confirm
	            against the caller). ``size`` is treated as a byte count.

	    Returns:
	        A ``(text, error_message)`` tuple. Exactly one element is ``None``:
	        ``(text, None)`` on success, ``(None, message)`` when the file is
	        over ``MAX_FILE_SIZE_MB`` or cannot be parsed.
	    """
	    # Reject oversized uploads before doing any parsing work.
	    file_size_mb = uploaded_file.size / (1024 * 1024)
	    if file_size_mb > MAX_FILE_SIZE_MB:
	        return None, f"⚠️ File too large ({file_size_mb:.2f}MB). Limit is {MAX_FILE_SIZE_MB}MB. For larger files, please use the RAG system."

	    # Call the raising helper rather than extract_text_from_file: the latter
	    # converts failures into an ordinary string, which would be handed back
	    # here as if it were the document's text.
	    try:
	        return _extract_text(uploaded_file), None
	    except Exception as e:
	        return None, f"Error parsing file: {str(e)}"


	def extract_text_from_file(uploaded_file):
	    """Detect the file type from its extension and return its text.

	    Kept for callers that rely on the original contract: this function
	    never raises on a parse failure; it returns a string beginning with
	    ``"Error reading file: "`` instead. Use ``validate_and_extract`` when
	    errors must be distinguishable from content.

	    Args:
	        uploaded_file: File-like upload exposing ``name`` and ``read()``.

	    Returns:
	        The extracted text, or an ``"Error reading file: ..."`` string.
	    """
	    try:
	        return _extract_text(uploaded_file)
	    except Exception as e:
	        return f"Error reading file: {str(e)}"


	def _extract_text(uploaded_file):
	    """Return the text of ``uploaded_file``; raise if it cannot be parsed."""
	    # The extension is everything after the last dot, compared case-insensitively.
	    file_type = uploaded_file.name.split('.')[-1].lower()

	    # PDF: one line break after each page's text.
	    if file_type == 'pdf':
	        reader = PyPDF2.PdfReader(uploaded_file)
	        # Defensive: a page that yields no text must not abort the whole file.
	        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

	    # Word: one line break after each paragraph.
	    # NOTE(review): legacy binary .doc files are routed here as well;
	    # python-docx presumably rejects them, which surfaces as an error - confirm.
	    if file_type in ('docx', 'doc'):
	        document = docx.Document(uploaded_file)
	        return "".join(para.text + "\n" for para in document.paragraphs)

	    # Tabular data: rendered through the DataFrame's string representation.
	    if file_type == 'csv':
	        return pd.read_csv(uploaded_file).to_string()
	    if file_type in ('xlsx', 'xls'):
	        return pd.read_excel(uploaded_file).to_string()

	    # Anything else is treated as UTF-8 plain text / Markdown.
	    return uploaded_file.read().decode("utf-8")