File size: 1,941 Bytes
46dcfa5
 
 
 
 
ff15ee0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46dcfa5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# file_processing.py
import PyPDF2
import docx
import pandas as pd
from io import BytesIO
import streamlit as st

MAX_FILE_SIZE_MB = 10


def validate_and_extract(uploaded_file):
    """Check the upload's size, then extract its text.

    Args:
        uploaded_file: File-like upload object exposing ``size`` (treated as
            a byte count), ``name`` (used to pick the parser by extension)
            and ``read()``.

    Returns:
        A ``(text, error_message)`` tuple. On success ``error_message`` is
        ``None``; on failure (file too large, or the parser raised) ``text``
        is ``None`` and ``error_message`` describes the problem.
    """
    # 1. Size check: reject before doing any parsing work.
    file_size_mb = uploaded_file.size / (1024 * 1024)
    if file_size_mb > MAX_FILE_SIZE_MB:
        return None, f"⚠️ File too large ({file_size_mb:.2f}MB). Limit is {MAX_FILE_SIZE_MB}MB. For larger files, please use the RAG system."

    # 2. Extract text. Call the raising helper directly (not
    # extract_text_from_file, which converts failures into a string) so a
    # parse error surfaces here as an error instead of being returned as text.
    try:
        return _extract_text(uploaded_file), None
    except Exception as e:
        return None, f"Error parsing file: {str(e)}"


def extract_text_from_file(uploaded_file):
    """Detect the file type from its extension and return the extracted text.

    This function does not raise on parse failures: any exception is
    converted into the string ``"Error reading file: <reason>"`` and returned
    in place of the text. That contract is kept for existing callers; use
    ``validate_and_extract`` to tell success and failure apart.

    Args:
        uploaded_file: File-like upload object exposing ``name`` and
            ``read()``.

    Returns:
        The extracted text, or an ``"Error reading file: ..."`` string.
    """
    try:
        return _extract_text(uploaded_file)
    except Exception as e:
        return f"Error reading file: {str(e)}"


def _extract_text(uploaded_file):
    """Extract text from ``uploaded_file``; raise if it cannot be parsed."""
    file_type = uploaded_file.name.split('.')[-1].lower()

    # Rewind first: if the stream was already consumed by an earlier read,
    # every parser below would otherwise see an empty file.
    seek = getattr(uploaded_file, "seek", None)
    if callable(seek):
        seek(0)

    # 1. Handle PDF
    if file_type == 'pdf':
        reader = PyPDF2.PdfReader(uploaded_file)
        # Defensive `or ""`: a page with no extractable text must not break
        # the concatenation.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )

    # 2. Handle Word (.docx)
    # NOTE(review): python-docx reads the .docx format; a genuine legacy
    # binary .doc is expected to raise here and be reported as an error.
    if file_type in ('docx', 'doc'):
        document = docx.Document(uploaded_file)
        return "".join(para.text + "\n" for para in document.paragraphs)

    # 3. Handle Excel/CSV: return the dataframe's string representation.
    if file_type == 'csv':
        return pd.read_csv(uploaded_file).to_string()
    if file_type in ('xlsx', 'xls'):
        return pd.read_excel(uploaded_file).to_string()

    # 4. Handle Plain Text / Markdown (and any other extension).
    # "utf-8-sig" decodes exactly like UTF-8 but drops a leading BOM so it
    # does not leak into the text as "\ufeff".
    return uploaded_file.read().decode("utf-8-sig")