NavyDevilDoc committed on
Commit
46dcfa5
·
verified ·
1 Parent(s): 0cff90b

Create file_processing.py

Browse files
Files changed (1) hide show
  1. file_processing.py +44 -0
file_processing.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # file_processing.py
2
+ import PyPDF2
3
+ import docx
4
+ import pandas as pd
5
+ from io import BytesIO
6
+
7
def extract_text_from_file(uploaded_file):
    """Extract the textual content of an uploaded file as one string.

    The handler is chosen from the lowercased text after the last ``.`` in
    ``uploaded_file.name``:

    * ``pdf`` -- text of every page read through ``PyPDF2.PdfReader``, each
      page followed by a newline.
    * ``docx`` / ``doc`` -- text of every paragraph read through
      ``docx.Document``, each paragraph followed by a newline.
    * ``csv`` / ``xlsx`` / ``xls`` -- the table loaded with pandas and
      rendered with ``DataFrame.to_string()``.
    * anything else -- the raw content decoded as UTF-8, with a leading
      byte-order mark dropped; content that ``read()`` already returns as
      ``str`` is passed through unchanged.

    Args:
        uploaded_file: File-like object exposing a ``name`` attribute and a
            ``read()`` method (presumably a web-framework upload object --
            confirm against the caller).

    Returns:
        The extracted text. If the extraction step raises, the exception is
        not propagated; instead the string
        ``"Error reading file: <exception message>"`` is returned.
    """
    file_type = uploaded_file.name.rsplit('.', 1)[-1].lower()

    try:
        # 1. Handle PDF
        if file_type == 'pdf':
            reader = PyPDF2.PdfReader(uploaded_file)
            # ``or ''`` keeps a page that yields no text (None) from
            # aborting extraction of the whole document.
            return "".join(
                f"{page.extract_text() or ''}\n" for page in reader.pages
            )

        # 2. Handle Word (.docx)
        # NOTE(review): python-docx targets .docx; a legacy binary .doc will
        # most likely fail to open and come back as an error string --
        # confirm whether 'doc' should stay in this list.
        if file_type in ('docx', 'doc'):
            document = docx.Document(uploaded_file)
            return "".join(f"{para.text}\n" for para in document.paragraphs)

        # 3. Handle Excel/CSV
        if file_type in ('csv', 'xlsx', 'xls'):
            if file_type == 'csv':
                df = pd.read_csv(uploaded_file)
            else:
                df = pd.read_excel(uploaded_file)
            # Convert dataframe to string representation
            return df.to_string()

        # 4. Handle Plain Text / Markdown (and any unrecognised extension)
        raw = uploaded_file.read()
        if isinstance(raw, str):
            # Text-mode stream: nothing to decode.
            return raw
        # "utf-8-sig" decodes ordinary UTF-8 identically but strips a
        # leading BOM instead of leaking "\ufeff" into the text.
        return raw.decode("utf-8-sig")

    except Exception as e:
        # Callers receive failures as a message string rather than as an
        # exception, so the broad catch is deliberate here.
        return f"Error reading file: {str(e)}"