# file_processing.py
import PyPDF2
import docx
import pandas as pd
from io import BytesIO
import streamlit as st
# Upper bound on accepted upload size, in megabytes; validate_and_extract
# compares it against uploaded_file.size / (1024 * 1024).
MAX_FILE_SIZE_MB = 10
def validate_and_extract(uploaded_file):
    """
    Check the upload's size, then extract its text.

    Args:
        uploaded_file: Uploaded file object exposing ``size`` (in bytes) plus
            whatever ``extract_text_from_file`` reads from it.

    Returns:
        A ``(text, error_message)`` tuple in which exactly one element is
        ``None``: ``(text, None)`` on success, ``(None, error_message)`` when
        the file exceeds ``MAX_FILE_SIZE_MB`` or could not be parsed.
    """
    # 1. Size Check
    file_size_mb = uploaded_file.size / (1024 * 1024)
    if file_size_mb > MAX_FILE_SIZE_MB:
        return None, f"⚠️ File too large ({file_size_mb:.2f}MB). Limit is {MAX_FILE_SIZE_MB}MB. For larger files, please use the RAG system."

    # 2. Extract Text
    try:
        text = extract_text_from_file(uploaded_file)
    except Exception as e:
        return None, f"Error parsing file: {str(e)}"

    # extract_text_from_file reports failures by *returning* a string that
    # starts with this prefix instead of raising. Without this check a failed
    # parse would be handed back as if it were the document's text.
    error_prefix = "Error reading file:"
    if isinstance(text, str) and text.startswith(error_prefix):
        return None, text

    return text, None
def extract_text_from_file(uploaded_file):
    """
    Detect the upload's type from its filename extension and return its text.

    Handled extensions: ``pdf`` (PyPDF2), ``docx`` (python-docx), and
    ``csv`` / ``xlsx`` / ``xls`` (pandas, rendered via ``DataFrame.to_string``).
    Any other extension, or a name without one, is read as UTF-8 text.

    Args:
        uploaded_file: File-like object exposing ``name`` and ``read()``
            (presumably a Streamlit ``UploadedFile`` -- verify against caller).

    Returns:
        The extracted text. Failures are not raised: any exception is caught
        and reported as a string beginning with ``"Error reading file: "``.
    """
    file_type = uploaded_file.name.split('.')[-1].lower()

    try:
        # 1. Handle PDF
        if file_type == 'pdf':
            reader = PyPDF2.PdfReader(uploaded_file)
            # `or ""` keeps a page with no extractable text from aborting
            # the whole document.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )

        # 2. Handle Word
        if file_type == 'doc':
            # python-docx only reads the Word 2007+ (.docx) format; passing a
            # legacy binary .doc to it fails with an opaque package error, so
            # report the real problem instead.
            return (
                "Error reading file: legacy .doc files are not supported; "
                "please save the document as .docx and upload it again."
            )
        if file_type == 'docx':
            doc = docx.Document(uploaded_file)
            return "".join(para.text + "\n" for para in doc.paragraphs)

        # 3. Handle Excel/CSV
        if file_type in ('csv', 'xlsx', 'xls'):
            if file_type == 'csv':
                df = pd.read_csv(uploaded_file)
            else:
                df = pd.read_excel(uploaded_file)
            # Convert dataframe to string representation
            return df.to_string()

        # 4. Handle Plain Text / Markdown
        # "utf-8-sig" decodes ordinary UTF-8 and also strips a leading BOM,
        # which would otherwise show up as "\ufeff" at the start of the text.
        return uploaded_file.read().decode("utf-8-sig")
    except Exception as e:
        return f"Error reading file: {str(e)}"