Writing_Assistant / file_processing.py
NavyDevilDoc's picture
Update file_processing.py
ff15ee0 verified
# file_processing.py
import PyPDF2
import docx
import pandas as pd
from io import BytesIO
import streamlit as st
# Largest upload, in megabytes, that validate_and_extract will attempt to parse.
MAX_FILE_SIZE_MB = 10


def validate_and_extract(uploaded_file):
    """Check the upload's size, then extract its text.

    Args:
        uploaded_file: File-like upload exposing ``size`` (in bytes),
            ``name`` and ``read()``. Presumably a Streamlit ``UploadedFile``;
            confirm against the caller.

    Returns:
        ``(text, None)`` on success, or ``(None, error_message)`` when the
        file is larger than ``MAX_FILE_SIZE_MB`` or cannot be parsed.
    """
    # 1. Size check
    file_size_mb = uploaded_file.size / (1024 * 1024)
    if file_size_mb > MAX_FILE_SIZE_MB:
        return None, f"⚠️ File too large ({file_size_mb:.2f}MB). Limit is {MAX_FILE_SIZE_MB}MB. For larger files, please use the RAG system."

    # 2. Extract text. Use the raising helper rather than
    # extract_text_from_file: the latter reports failures as an ordinary
    # string, which would be handed back here as if it were document text.
    try:
        file_type = _file_extension(uploaded_file.name)
        return _extract_text(uploaded_file, file_type), None
    except Exception as e:
        return None, f"Error parsing file: {str(e)}"


def extract_text_from_file(uploaded_file):
    """Detect the file type from its name and return the extracted text.

    Args:
        uploaded_file: File-like upload exposing ``name`` and ``read()``.

    Returns:
        The extracted text. If parsing fails, the string
        ``"Error reading file: <details>"`` is returned instead of raising;
        callers that must tell failure from content should use
        ``validate_and_extract``.
    """
    # Resolved outside the try so a missing ``name`` still raises, as before.
    file_type = _file_extension(uploaded_file.name)
    try:
        return _extract_text(uploaded_file, file_type)
    except Exception as e:
        return f"Error reading file: {str(e)}"


def _file_extension(filename):
    """Return the lowercased extension of *filename* without the dot, or ""."""
    # rpartition reports whether a dot was present at all, so a name such as
    # "csv" (no extension) is not mistaken for a CSV file.
    _, dot, ext = filename.rpartition('.')
    return ext.lower() if dot else ""


def _extract_text(uploaded_file, file_type):
    """Extract text for the given lowercased extension; raise on any failure."""
    # Rewind first so a stream that was already consumed (for example by an
    # earlier call on the same upload object) is read from the start.
    seekable = getattr(uploaded_file, "seekable", None)
    if callable(seekable) and seekable():
        uploaded_file.seek(0)

    # 1. Handle PDF
    if file_type == 'pdf':
        reader = PyPDF2.PdfReader(uploaded_file)
        # Guard against a page yielding no text (None) so one empty page
        # does not abort the whole document with a TypeError.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    # 2. Handle Word
    if file_type in ('docx', 'doc'):
        try:
            doc = docx.Document(uploaded_file)
        except Exception as e:
            if file_type == 'doc':
                # python-docx targets .docx packages; give a clearer message
                # for .doc uploads while keeping the underlying error.
                raise ValueError(
                    "Could not read .doc file (legacy binary Word documents "
                    f"are not supported; save as .docx): {e}"
                ) from e
            raise
        return "".join(para.text + "\n" for para in doc.paragraphs)

    # 3. Handle Excel/CSV
    if file_type in ('csv', 'xlsx', 'xls'):
        if file_type == 'csv':
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
        # Convert dataframe to string representation
        return df.to_string()

    # 4. Handle plain text / Markdown (and any unrecognised extension).
    # "utf-8-sig" decodes ordinary UTF-8 and also drops a leading BOM so it
    # does not leak into the extracted text.
    return uploaded_file.read().decode("utf-8-sig")