Spaces:

csAhmad
/

CV_Job_Matching_AI_Model

Sleeping

App Files Community

csAhmad committed on Apr 27

Commit

ca1e310

verified ·

1 Parent(s): b11809d

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -18

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import zipfile
 import os
 import pandas as pd
 from sentence_transformers import SentenceTransformer
 from pypdf import PdfReader
 import docx
@@ -13,23 +12,18 @@ EXTRACT_PATH = "temp/extracted"
 # -------------------------
-# TEXT EXTRACTORS
 # -------------------------
 def extract_text(file_path):
-    ext = file_path.lower()
-    # TXT
-    if ext.endswith(".txt"):
-        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
-            return f.read()
     # PDF
-    elif ext.endswith(".pdf"):
         reader = PdfReader(file_path)
-        return " ".join([page.extract_text() or "" for page in reader.pages])
     # DOCX
-    elif ext.endswith(".docx"):
         doc = docx.Document(file_path)
         return "\n".join([para.text for para in doc.paragraphs])
@@ -44,7 +38,7 @@ def process_zip(zip_file):
     if zip_file is None:
         raise gr.Error("Please upload a ZIP file.")
-    # clean folder
     if os.path.exists(EXTRACT_PATH):
         for root, dirs, files in os.walk(EXTRACT_PATH):
             for f in files:
@@ -57,7 +51,7 @@ def process_zip(zip_file):
     zip_path = zip_file.name
-    # extract ZIP
     try:
         with zipfile.ZipFile(zip_path, "r") as zip_ref:
             zip_ref.extractall(EXTRACT_PATH)
@@ -66,7 +60,7 @@ def process_zip(zip_file):
     results = []
-    # recursive scan
     for root, dirs, files in os.walk(EXTRACT_PATH):
         for file in files:
             file_path = os.path.join(root, file)
@@ -88,10 +82,10 @@ def process_zip(zip_file):
                 })
             except Exception as e:
-                print(f"Embedding error: {file_path} -> {e}")
     if len(results) == 0:
-        raise gr.Error("No readable TXT, PDF, or DOCX files found in ZIP.")
     df = pd.DataFrame(results)
@@ -108,8 +102,8 @@ demo = gr.Interface(
     fn=process_zip,
     inputs=gr.File(file_types=[".zip"]),
     outputs=gr.File(label="Download Excel"),
-    title="ZIP → Multiformat Embeddings",
-    description="Upload ZIP containing TXT, PDF, DOCX (even in folders). Generates embeddings and exports Excel."
 )
 demo.launch()

 import os
 import pandas as pd
 from sentence_transformers import SentenceTransformer
 from pypdf import PdfReader
 import docx
 # -------------------------
+# TEXT EXTRACTION
 # -------------------------
 def extract_text(file_path):
+    path = file_path.lower()
     # PDF
+    if path.endswith(".pdf"):
         reader = PdfReader(file_path)
+        return " ".join([p.extract_text() or "" for p in reader.pages])
     # DOCX
+    elif path.endswith(".docx"):
         doc = docx.Document(file_path)
         return "\n".join([para.text for para in doc.paragraphs])
     if zip_file is None:
         raise gr.Error("Please upload a ZIP file.")
+    # reset folder
     if os.path.exists(EXTRACT_PATH):
         for root, dirs, files in os.walk(EXTRACT_PATH):
             for f in files:
     zip_path = zip_file.name
+    # extract zip
     try:
         with zipfile.ZipFile(zip_path, "r") as zip_ref:
             zip_ref.extractall(EXTRACT_PATH)
     results = []
+    # 🔥 recursive scan (root + folders)
     for root, dirs, files in os.walk(EXTRACT_PATH):
         for file in files:
             file_path = os.path.join(root, file)
                 })
             except Exception as e:
+                print(f"Embedding failed for {file_path}: {e}")
     if len(results) == 0:
+        raise gr.Error("No readable PDF or DOCX files found in ZIP.")
     df = pd.DataFrame(results)
     fn=process_zip,
     inputs=gr.File(file_types=[".zip"]),
     outputs=gr.File(label="Download Excel"),
+    title="ZIP → PDF/DOCX Embeddings",
+    description="Upload ZIP containing PDFs and DOCX files (even inside folders). Outputs embeddings in Excel."
 )
 demo.launch()