csAhmad committed on
Commit
b11809d
·
verified ·
1 Parent(s): 4eb565b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -38
app.py CHANGED
@@ -4,77 +4,95 @@ import os
4
  import pandas as pd
5
  from sentence_transformers import SentenceTransformer
6
 
7
- # Load model from HF
 
 
8
  model = SentenceTransformer("csAhmad/zoraiz-model")
9
 
10
- # Ensure temp directory exists
11
  EXTRACT_PATH = "temp/extracted"
12
- os.makedirs(EXTRACT_PATH, exist_ok=True)
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def process_zip(zip_file):
16
- # ---------------------------
17
- # 1. Validate input
18
- # ---------------------------
19
  if zip_file is None:
20
  raise gr.Error("Please upload a ZIP file.")
21
 
22
- # reset extraction folder
23
  if os.path.exists(EXTRACT_PATH):
24
- for f in os.listdir(EXTRACT_PATH):
25
- try:
26
- os.remove(os.path.join(EXTRACT_PATH, f))
27
- except:
28
- pass
 
29
 
30
  os.makedirs(EXTRACT_PATH, exist_ok=True)
31
 
32
  zip_path = zip_file.name
33
 
34
- # ---------------------------
35
- # 2. Extract ZIP safely
36
- # ---------------------------
37
  try:
38
  with zipfile.ZipFile(zip_path, "r") as zip_ref:
39
  zip_ref.extractall(EXTRACT_PATH)
40
  except zipfile.BadZipFile:
41
- raise gr.Error("Invalid ZIP file uploaded.")
42
 
43
- # ---------------------------
44
- # 3. Process files
45
- # ---------------------------
46
  results = []
47
 
 
48
  for root, dirs, files in os.walk(EXTRACT_PATH):
49
  for file in files:
50
  file_path = os.path.join(root, file)
51
 
52
- # only process text files
53
- if file.lower().endswith(".txt"):
54
- with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
55
- text = f.read()
56
 
57
- if not text.strip():
58
- continue
59
 
 
60
  emb = model.encode(text)
61
 
62
  results.append({
63
- "file": file,
 
64
  "text_length": len(text),
65
  "embedding_dim": len(emb),
66
  "preview_embedding": str(emb[:10])
67
  })
68
 
69
- # ---------------------------
70
- # 4. Handle empty results
71
- # ---------------------------
72
  if len(results) == 0:
73
- raise gr.Error("No valid .txt files found inside ZIP.")
74
 
75
- # ---------------------------
76
- # 5. Create Excel file
77
- # ---------------------------
78
  df = pd.DataFrame(results)
79
 
80
  output_file = "output.xlsx"
@@ -83,15 +101,15 @@ def process_zip(zip_file):
83
  return output_file
84
 
85
 
86
- # ---------------------------
87
- # 6. Gradio UI
88
- # ---------------------------
89
  demo = gr.Interface(
90
  fn=process_zip,
91
  inputs=gr.File(file_types=[".zip"]),
92
  outputs=gr.File(label="Download Excel"),
93
- title="ZIP → Embeddings → Excel Generator",
94
- description="Upload a ZIP containing .txt files. It will generate embeddings using SentenceTransformer and export to Excel."
95
  )
96
 
97
  demo.launch()
 
4
  import pandas as pd
5
  from sentence_transformers import SentenceTransformer
6
 
7
+ from pypdf import PdfReader
8
+ import docx
9
+
10
  model = SentenceTransformer("csAhmad/zoraiz-model")
11
 
 
12
  EXTRACT_PATH = "temp/extracted"
 
13
 
14
 
15
+ # -------------------------
16
+ # TEXT EXTRACTORS
17
+ # -------------------------
18
def extract_text(file_path):
    """Return the plain text of a TXT, PDF, or DOCX file.

    The format is chosen from the end of the file name, compared
    case-insensitively.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text as a string, or None when the extension is not
        one of .txt/.pdf/.docx or the file could not be read or parsed.
    """
    lowered = file_path.lower()

    try:
        # TXT: undecodable bytes are dropped instead of raising.
        if lowered.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()

        # PDF: `or ""` guards against a page that yields no text.
        if lowered.endswith(".pdf"):
            reader = PdfReader(file_path)
            return " ".join(page.extract_text() or "" for page in reader.pages)

        # DOCX: only paragraph text is collected.
        if lowered.endswith(".docx"):
            document = docx.Document(file_path)
            return "\n".join(para.text for para in document.paragraphs)
    except Exception as e:
        # A single corrupt, encrypted, or unreadable file should be skipped,
        # not abort the processing of every other file; log it and report
        # "no text" the same way an unsupported file is reported.
        print(f"Extraction error: {file_path} -> {e}")
        return None

    # Unsupported extension.
    return None
37
+
38
+
39
+ # -------------------------
40
+ # MAIN FUNCTION
41
+ # -------------------------
42
  def process_zip(zip_file):
43
+
 
 
44
  if zip_file is None:
45
  raise gr.Error("Please upload a ZIP file.")
46
 
47
+ # clean folder
48
  if os.path.exists(EXTRACT_PATH):
49
+ for root, dirs, files in os.walk(EXTRACT_PATH):
50
+ for f in files:
51
+ try:
52
+ os.remove(os.path.join(root, f))
53
+ except:
54
+ pass
55
 
56
  os.makedirs(EXTRACT_PATH, exist_ok=True)
57
 
58
  zip_path = zip_file.name
59
 
60
+ # extract ZIP
 
 
61
  try:
62
  with zipfile.ZipFile(zip_path, "r") as zip_ref:
63
  zip_ref.extractall(EXTRACT_PATH)
64
  except zipfile.BadZipFile:
65
+ raise gr.Error("Invalid ZIP file.")
66
 
 
 
 
67
  results = []
68
 
69
+ # recursive scan
70
  for root, dirs, files in os.walk(EXTRACT_PATH):
71
  for file in files:
72
  file_path = os.path.join(root, file)
73
 
74
+ text = extract_text(file_path)
 
 
 
75
 
76
+ if not text or not text.strip():
77
+ continue
78
 
79
+ try:
80
  emb = model.encode(text)
81
 
82
  results.append({
83
+ "file": os.path.relpath(file_path, EXTRACT_PATH),
84
+ "type": file.split(".")[-1],
85
  "text_length": len(text),
86
  "embedding_dim": len(emb),
87
  "preview_embedding": str(emb[:10])
88
  })
89
 
90
+ except Exception as e:
91
+ print(f"Embedding error: {file_path} -> {e}")
92
+
93
  if len(results) == 0:
94
+ raise gr.Error("No readable TXT, PDF, or DOCX files found in ZIP.")
95
 
 
 
 
96
  df = pd.DataFrame(results)
97
 
98
  output_file = "output.xlsx"
 
101
  return output_file
102
 
103
 
104
# -------------------------
# GRADIO UI
# -------------------------
# One-function interface: a ZIP upload goes in, process_zip runs on it,
# and the file path it returns is offered back as a download.
demo = gr.Interface(
    process_zip,
    gr.File(file_types=[".zip"]),
    gr.File(label="Download Excel"),
    title="ZIP → Multiformat Embeddings",
    description="Upload ZIP containing TXT, PDF, DOCX (even in folders). Generates embeddings and exports Excel.",
)

# Start the web app.
demo.launch()