# file_processing.py
from io import BytesIO

import docx
import pandas as pd
import PyPDF2
import streamlit as st

MAX_FILE_SIZE_MB = 10


def validate_and_extract(uploaded_file):
    """Check the upload's size and extract its text.

    Args:
        uploaded_file: File-like upload exposing ``.size`` (bytes), ``.name``
            and ``.read()`` -- presumably a Streamlit ``UploadedFile``;
            confirm against the caller.

    Returns:
        A ``(text, error_message)`` tuple. Exactly one element is ``None``:
        ``(text, None)`` on success, ``(None, message)`` when the file is
        larger than ``MAX_FILE_SIZE_MB`` or cannot be parsed.
    """
    # 1. Size check
    file_size_mb = uploaded_file.size / (1024 * 1024)
    if file_size_mb > MAX_FILE_SIZE_MB:
        return None, (
            f"⚠️ File too large ({file_size_mb:.2f}MB). "
            f"Limit is {MAX_FILE_SIZE_MB}MB. "
            "For larger files, please use the RAG system."
        )

    # 2. Extract text. The raising helper is used here instead of
    # extract_text_from_file(), which turns failures into an ordinary string
    # and would make a parse error look like successfully extracted text.
    try:
        return _extract_text(uploaded_file), None
    except Exception as e:
        return None, f"Error parsing file: {str(e)}"


def extract_text_from_file(uploaded_file):
    """Detect the file type from its extension and return the extracted text.

    Never raises: if reading or parsing fails, the returned string is
    ``"Error reading file: <reason>"`` instead of the file's text.
    """
    try:
        return _extract_text(uploaded_file)
    except Exception as e:
        return f"Error reading file: {str(e)}"


def _extract_text(uploaded_file):
    """Extract text based on the file extension; parser errors propagate."""
    # Everything after the last dot; a name without a dot falls through to
    # the plain-text branch at the bottom.
    file_type = uploaded_file.name.split('.')[-1].lower()

    # 1. Handle PDF
    if file_type == 'pdf':
        reader = PyPDF2.PdfReader(uploaded_file)
        # `or ""` guards against a page yielding None instead of a string.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )

    # 2. Handle Word
    # NOTE(review): python-docx targets .docx; a legacy binary .doc will most
    # likely fail inside docx.Document -- confirm whether 'doc' should stay.
    if file_type in ('docx', 'doc'):
        document = docx.Document(uploaded_file)
        return "".join(para.text + "\n" for para in document.paragraphs)

    # 3. Handle Excel/CSV: render the dataframe as its string representation
    if file_type == 'csv':
        return pd.read_csv(uploaded_file).to_string()
    if file_type in ('xlsx', 'xls'):
        return pd.read_excel(uploaded_file).to_string()

    # 4. Handle plain text / Markdown (and any other extension):
    # decode the raw bytes as UTF-8.
    return uploaded_file.read().decode("utf-8")