csAhmad committed on
Commit
ca1e310
·
verified ·
1 Parent(s): b11809d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -18
app.py CHANGED
@@ -3,7 +3,6 @@ import zipfile
3
  import os
4
  import pandas as pd
5
  from sentence_transformers import SentenceTransformer
6
-
7
  from pypdf import PdfReader
8
  import docx
9
 
@@ -13,23 +12,18 @@ EXTRACT_PATH = "temp/extracted"
13
 
14
 
15
  # -------------------------
16
- # TEXT EXTRACTORS
17
  # -------------------------
18
  def extract_text(file_path):
19
- ext = file_path.lower()
20
-
21
- # TXT
22
- if ext.endswith(".txt"):
23
- with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
24
- return f.read()
25
 
26
  # PDF
27
- elif ext.endswith(".pdf"):
28
  reader = PdfReader(file_path)
29
- return " ".join([page.extract_text() or "" for page in reader.pages])
30
 
31
  # DOCX
32
- elif ext.endswith(".docx"):
33
  doc = docx.Document(file_path)
34
  return "\n".join([para.text for para in doc.paragraphs])
35
 
@@ -44,7 +38,7 @@ def process_zip(zip_file):
44
  if zip_file is None:
45
  raise gr.Error("Please upload a ZIP file.")
46
 
47
- # clean folder
48
  if os.path.exists(EXTRACT_PATH):
49
  for root, dirs, files in os.walk(EXTRACT_PATH):
50
  for f in files:
@@ -57,7 +51,7 @@ def process_zip(zip_file):
57
 
58
  zip_path = zip_file.name
59
 
60
- # extract ZIP
61
  try:
62
  with zipfile.ZipFile(zip_path, "r") as zip_ref:
63
  zip_ref.extractall(EXTRACT_PATH)
@@ -66,7 +60,7 @@ def process_zip(zip_file):
66
 
67
  results = []
68
 
69
- # recursive scan
70
  for root, dirs, files in os.walk(EXTRACT_PATH):
71
  for file in files:
72
  file_path = os.path.join(root, file)
@@ -88,10 +82,10 @@ def process_zip(zip_file):
88
  })
89
 
90
  except Exception as e:
91
- print(f"Embedding error: {file_path} -> {e}")
92
 
93
  if len(results) == 0:
94
- raise gr.Error("No readable TXT, PDF, or DOCX files found in ZIP.")
95
 
96
  df = pd.DataFrame(results)
97
 
@@ -108,8 +102,8 @@ demo = gr.Interface(
108
  fn=process_zip,
109
  inputs=gr.File(file_types=[".zip"]),
110
  outputs=gr.File(label="Download Excel"),
111
- title="ZIP → Multiformat Embeddings",
112
- description="Upload ZIP containing TXT, PDF, DOCX (even in folders). Generates embeddings and exports Excel."
113
  )
114
 
115
  demo.launch()
 
3
  import os
4
  import pandas as pd
5
  from sentence_transformers import SentenceTransformer
 
6
  from pypdf import PdfReader
7
  import docx
8
 
 
12
 
13
 
14
  # -------------------------
15
+ # TEXT EXTRACTION
16
  # -------------------------
17
  def extract_text(file_path):
18
+ path = file_path.lower()
 
 
 
 
 
19
 
20
  # PDF
21
+ if path.endswith(".pdf"):
22
  reader = PdfReader(file_path)
23
+ return " ".join([p.extract_text() or "" for p in reader.pages])
24
 
25
  # DOCX
26
+ elif path.endswith(".docx"):
27
  doc = docx.Document(file_path)
28
  return "\n".join([para.text for para in doc.paragraphs])
29
 
 
38
  if zip_file is None:
39
  raise gr.Error("Please upload a ZIP file.")
40
 
41
+ # reset folder
42
  if os.path.exists(EXTRACT_PATH):
43
  for root, dirs, files in os.walk(EXTRACT_PATH):
44
  for f in files:
 
51
 
52
  zip_path = zip_file.name
53
 
54
+ # extract zip
55
  try:
56
  with zipfile.ZipFile(zip_path, "r") as zip_ref:
57
  zip_ref.extractall(EXTRACT_PATH)
 
60
 
61
  results = []
62
 
63
+ # 🔥 recursive scan (root + folders)
64
  for root, dirs, files in os.walk(EXTRACT_PATH):
65
  for file in files:
66
  file_path = os.path.join(root, file)
 
82
  })
83
 
84
  except Exception as e:
85
+ print(f"Embedding failed for {file_path}: {e}")
86
 
87
  if len(results) == 0:
88
+ raise gr.Error("No readable PDF or DOCX files found in ZIP.")
89
 
90
  df = pd.DataFrame(results)
91
 
 
102
  fn=process_zip,
103
  inputs=gr.File(file_types=[".zip"]),
104
  outputs=gr.File(label="Download Excel"),
105
+ title="ZIP → PDF/DOCX Embeddings",
106
+ description="Upload ZIP containing PDFs and DOCX files (even inside folders). Outputs embeddings in Excel."
107
  )
108
 
109
  demo.launch()