jithenderchoudary committed on
Commit
8ff450e
·
verified ·
1 Parent(s): 47d2da2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -6
app.py CHANGED
@@ -2,28 +2,30 @@ import fitz # PyMuPDF
2
  import pandas as pd
3
  import gradio as gr
4
  import tempfile
 
5
 
6
  def extract_po_to_excel(pdf_file):
7
  try:
8
- # Load PDF and extract text
 
9
  with fitz.open(pdf_file.name) as pdf:
10
  data = []
 
 
11
  for page_num in range(pdf.page_count):
12
  page = pdf[page_num]
13
  text = page.get_text("text")
 
14
 
15
  # Simple example of extraction (customize parsing as needed)
16
  lines = text.splitlines()
17
  for line in lines:
18
- # Only extract lines with known keywords (sample logic; adjust as necessary)
19
  if "Pos." in line or "Item Code" in line:
20
  data.append(line)
21
 
22
  # Example structure, parse `data` into structured format
23
  structured_data = []
24
  for line in data:
25
- # Custom parsing logic goes here; here's a basic split by spaces
26
- # Adjust parsing to match your actual data needs
27
  parts = line.split()
28
  if len(parts) > 1:
29
  structured_data.append({
@@ -34,16 +36,20 @@ def extract_po_to_excel(pdf_file):
34
 
35
  # Create DataFrame and export to Excel
36
  df = pd.DataFrame(structured_data)
37
-
 
38
  # Save to temporary file
39
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
40
  df.to_excel(temp_file.name, index=False)
41
  temp_file.close()
 
42
 
43
  return temp_file.name
44
 
45
  except Exception as e:
46
- print(f"Error: {e}")
 
 
47
  return None
48
 
49
  def main(pdf_file):
@@ -66,3 +72,4 @@ if __name__ == "__main__":
66
  interface.launch()
67
 
68
 
 
 
2
  import pandas as pd
3
  import gradio as gr
4
  import tempfile
5
+ import traceback
6
 
7
  def extract_po_to_excel(pdf_file):
8
  try:
9
+ # Attempt to open and read the PDF file
10
+ print("Starting PDF extraction process.")
11
  with fitz.open(pdf_file.name) as pdf:
12
  data = []
13
+ print("PDF opened successfully.")
14
+
15
  for page_num in range(pdf.page_count):
16
  page = pdf[page_num]
17
  text = page.get_text("text")
18
+ print(f"Extracted text from page {page_num + 1}")
19
 
20
  # Simple example of extraction (customize parsing as needed)
21
  lines = text.splitlines()
22
  for line in lines:
 
23
  if "Pos." in line or "Item Code" in line:
24
  data.append(line)
25
 
26
  # Example structure, parse `data` into structured format
27
  structured_data = []
28
  for line in data:
 
 
29
  parts = line.split()
30
  if len(parts) > 1:
31
  structured_data.append({
 
36
 
37
  # Create DataFrame and export to Excel
38
  df = pd.DataFrame(structured_data)
39
+ print("DataFrame created successfully.")
40
+
41
  # Save to temporary file
42
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
43
  df.to_excel(temp_file.name, index=False)
44
  temp_file.close()
45
+ print(f"Excel file saved at {temp_file.name}")
46
 
47
  return temp_file.name
48
 
49
  except Exception as e:
50
+ # Capture and print the full traceback for debugging
51
+ print("An error occurred during PDF to Excel conversion.")
52
+ traceback.print_exc()
53
  return None
54
 
55
  def main(pdf_file):
 
72
  interface.launch()
73
 
74
 
75
+