Spaces:
Sleeping
Sleeping
| # file_processing.py | |
| import PyPDF2 | |
| import docx | |
| import pandas as pd | |
| from io import BytesIO | |
| import streamlit as st | |
MAX_FILE_SIZE_MB = 10  # upload size ceiling enforced by validate_and_extract


def validate_and_extract(uploaded_file):
    """Check the upload's size, then extract its text.

    Args:
        uploaded_file: File-like upload exposing ``size`` (in bytes),
            ``name`` and ``read()``. Presumably a Streamlit uploaded file;
            confirm against the caller.

    Returns:
        ``(text, None)`` on success, or ``(None, error_message)`` when the
        file is larger than ``MAX_FILE_SIZE_MB`` or cannot be parsed.
    """
    # 1. Size check
    file_size_mb = uploaded_file.size / (1024 * 1024)
    if file_size_mb > MAX_FILE_SIZE_MB:
        return None, f"⚠️ File too large ({file_size_mb:.2f}MB). Limit is {MAX_FILE_SIZE_MB}MB. For larger files, please use the RAG system."

    # 2. Extract text. The raising helper is used (not the public
    # extract_text_from_file) so that a parse failure is reported in the
    # error slot instead of being handed back as if it were document text.
    try:
        file_type = _file_extension(uploaded_file)
        return _extract_text(uploaded_file, file_type), None
    except Exception as e:
        # Broad on purpose: this is the boundary for several third-party
        # parsers that raise unrelated exception types.
        return None, f"Error parsing file: {str(e)}"


def extract_text_from_file(uploaded_file):
    """Detect the file type from its extension and return the extracted text.

    Kept backward compatible: a parsing failure does not raise but is
    returned as the string ``"Error reading file: <details>"``. Callers that
    need to tell failures apart from content should use
    ``validate_and_extract`` instead.

    Args:
        uploaded_file: File-like object exposing ``name`` and ``read()``.

    Returns:
        The extracted text, or an ``"Error reading file: ..."`` message.
    """
    # Resolved outside the try, as before, so a missing ``name`` still raises.
    file_type = _file_extension(uploaded_file)
    try:
        return _extract_text(uploaded_file, file_type)
    except Exception as e:
        return f"Error reading file: {str(e)}"


def _file_extension(uploaded_file):
    """Return the lower-cased text after the last '.' in the file name."""
    return uploaded_file.name.split('.')[-1].lower()


def _rewind(uploaded_file):
    """Move the stream back to its start when it supports seeking."""
    seek = getattr(uploaded_file, "seek", None)
    if seek is None:
        return
    try:
        seek(0)
    except (OSError, ValueError):
        # Unseekable stream: fall back to reading from the current position,
        # which is what the code did before rewinding was added.
        pass


def _extract_text(uploaded_file, file_type):
    """Extract text for the given extension; parser errors propagate."""
    # An upload that was already consumed would otherwise parse as empty.
    _rewind(uploaded_file)

    # 1. Handle PDF
    if file_type == 'pdf':
        reader = PyPDF2.PdfReader(uploaded_file)
        # ``or ""`` guards against a page yielding no text (None), which
        # would raise TypeError on concatenation.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )

    # 2. Handle Word
    # NOTE(review): python-docx is documented for .docx only; a true legacy
    # .doc is expected to fail here and surface as an error - confirm.
    if file_type in ['docx', 'doc']:
        doc = docx.Document(uploaded_file)
        return "".join(para.text + "\n" for para in doc.paragraphs)

    # 3. Handle Excel/CSV: rendered via the DataFrame's string representation
    if file_type in ['csv', 'xlsx', 'xls']:
        if file_type == 'csv':
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
        return df.to_string()

    # 4. Anything else is treated as UTF-8 plain text / Markdown
    return uploaded_file.read().decode("utf-8")