NavyDevilDoc committed on
Commit
5b8cf94
·
verified ·
1 Parent(s): 6f1c390

Update src/parsers.py

Browse files
Files changed (1) hide show
  1. src/parsers.py +57 -24
src/parsers.py CHANGED
@@ -3,45 +3,78 @@ import docx
3
  import pandas as pd
4
  from pdf2image import convert_from_bytes
5
  import pytesseract
6
- import io
7
 
8
- def parse_file(uploaded_file):
9
- filename = uploaded_file.name
 
 
 
10
  text = ""
 
 
11
 
12
  try:
 
13
  if filename.endswith(".pdf"):
 
14
  reader = pypdf.PdfReader(uploaded_file)
15
- def parse_file(uploaded_file):
16
 
17
- for i, page in enumerate(reader.pages):
18
- extracted = page.extract_text()
19
- if extracted:
20
- text += f"\n[PAGE {i+1}] {extracted}"
21
-
22
- if len(text.strip()) < 50:
23
- method = "OCR (Slow)"
24
- images = convert_from_bytes(pdf_bytes)
25
- text = ""
26
- for i, img in enumerate(images):
27
- page_text = pytesseract.image_to_string(img)
28
- text += f"\n[PAGE {i+1}] {page_text}"
 
29
 
 
30
  elif filename.endswith(".docx"):
31
  doc = docx.Document(uploaded_file)
32
- text = "\n".join([p.text for p in doc.paragraphs])
33
-
 
34
  elif filename.endswith(".csv"):
35
- # NEW: CSV Handling
36
  df = pd.read_csv(uploaded_file)
37
- text = df.to_string(index=False) # Flattens table to string
 
38
 
39
  elif filename.endswith(".xlsx") or filename.endswith(".xls"):
40
- # NEW: Excel Handling
41
  df = pd.read_excel(uploaded_file)
42
  text = df.to_string(index=False)
 
 
 
 
 
43
 
44
- except Exception as e:
45
- return None, f"Error: {e}"
46
 
47
- return text, "Success"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import pandas as pd
4
  from pdf2image import convert_from_bytes
5
  import pytesseract
6
+ import uuid
7
 
8
def process_file(uploaded_file):
    """
    Extract plain text from an uploaded document.

    Input: Streamlit UploadedFile. The code reads ``uploaded_file.name``,
        calls ``getvalue()`` for PDFs and ``read()`` for .txt files, and
        hands the object itself to pypdf / python-docx / pandas otherwise.
    Output: (full_text, filename, method)
        full_text -- the extracted text, or "" when nothing could be parsed.
        filename  -- ``uploaded_file.name`` exactly as received.
        method    -- "Fast Text", "OCR (Slow)", "Table Parse", or
                     "Error: <details>" when parsing raised or the file
                     extension is not one of the supported types.

    Extension matching is case-insensitive, so "REPORT.PDF" is parsed the
    same way as "report.pdf".
    """
    text = ""
    filename = uploaded_file.name
    # Match on a lower-cased copy only; the original name is what we return.
    lowered_name = filename.lower()
    method = "Fast Text"

    try:
        # 1. PDF Handling
        if lowered_name.endswith(".pdf"):
            # Grab the raw bytes up front: the OCR fallback needs them to
            # rasterize the pages.
            pdf_bytes = uploaded_file.getvalue()
            reader = pypdf.PdfReader(uploaded_file)

            page_parts = []
            for i, page in enumerate(reader.pages):
                extracted = page.extract_text()
                if extracted:
                    page_parts.append(f"\n[PAGE {i+1}] {extracted}")
            text = "".join(page_parts)

            # OCR Fallback: fewer than 50 characters of embedded text is
            # treated as "no usable text layer", so every page is OCR'd.
            if len(text.strip()) < 50:
                method = "OCR (Slow)"
                images = convert_from_bytes(pdf_bytes)
                text = "".join(
                    f"\n[PAGE {i+1}] {pytesseract.image_to_string(img)}"
                    for i, img in enumerate(images)
                )

        # 2. Word Handling
        elif lowered_name.endswith(".docx"):
            doc = docx.Document(uploaded_file)
            text = "\n".join([para.text for para in doc.paragraphs])

        # 3. Excel/CSV Handling
        elif lowered_name.endswith(".csv"):
            df = pd.read_csv(uploaded_file)
            text = df.to_string(index=False)
            method = "Table Parse"

        elif lowered_name.endswith((".xlsx", ".xls")):
            df = pd.read_excel(uploaded_file)
            text = df.to_string(index=False)
            method = "Table Parse"

        # 4. Plain Text
        elif lowered_name.endswith(".txt"):
            text = uploaded_file.read().decode("utf-8")

        # 5. Anything else: report it instead of pretending an empty
        #    "Fast Text" parse succeeded.
        else:
            return "", filename, f"Error: Unsupported file type: {filename}"

    except Exception as e:
        # Boundary handler: any parser failure is reported through the
        # method slot so the caller never has to catch parser exceptions.
        return "", filename, f"Error: {e}"

    return text, filename, method
61
+
62
def chunk_text(text, source, chunk_size=500, overlap=100):
    """
    Generates chunks AND assigns a unique doc_id to link them together.

    The text is split on whitespace and walked with a sliding window of
    ``chunk_size`` words that advances ``chunk_size - overlap`` words per
    step, so consecutive chunks share ``overlap`` words.

    Input:
        text       -- the full document text.
        source     -- stored verbatim under each chunk's "source" key.
        chunk_size -- window length in words; must be positive.
        overlap    -- words shared by neighbouring chunks; must be smaller
                      than chunk_size so the window moves forward.
    Output: (chunks, doc_id)
        chunks -- list of dicts with keys "text", "source", "doc_id",
                  "chunk_id"; chunks of 20 characters or fewer are dropped.
        doc_id -- UUID string shared by every chunk of this call.
    Raises:
        ValueError -- if chunk_size <= 0 or overlap >= chunk_size.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    step = chunk_size - overlap
    if step <= 0:
        # A zero step makes range() raise a cryptic error and a negative
        # step silently yields no chunks at all; fail loudly instead.
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    chunks = []
    doc_id = str(uuid.uuid4())  # Generate ID once per document

    for start in range(0, len(words), step):
        piece = " ".join(words[start:start + chunk_size])
        if len(piece) > 20:  # Minimal filter
            chunks.append({
                "text": piece,
                "source": source,
                "doc_id": doc_id,
                "chunk_id": str(uuid.uuid4()),
            })
        # Once this window has reached the last word, any further window
        # would contain only words already emitted (pure overlap), so stop.
        if start + chunk_size >= len(words):
            break

    return chunks, doc_id