sid22669 committed on
Commit
2231637
·
verified ·
1 Parent(s): 7132f90

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -22
app.py CHANGED
@@ -6,36 +6,38 @@ import pandas as pd
6
  import os
7
  import uuid
8
  from datetime import datetime
9
- from docx import Document
10
  import tempfile
 
11
 
12
  # Load model and vectorizer
13
  classifier_model = joblib.load('resume_classifier')
14
  resume_vectorizer = joblib.load('resume_vectorizer')
15
 
16
 
17
- def read_file(file_path):
18
- try:
19
- ext = os.path.splitext(file_path)[1].lower()
20
 
 
21
  if ext == ".pdf":
22
- with open(file_path, "rb") as file:
23
- reader = PyPDF2.PdfReader(file)
24
- text = ""
25
- for page in reader.pages:
26
- page_text = page.extract_text()
27
- if page_text:
28
- text += page_text + "\n"
29
- return text.strip()
30
 
31
  elif ext == ".txt":
32
- with open(file_path, "r", encoding="utf-8") as file:
33
- return file.read().strip()
34
 
35
  elif ext in [".doc", ".docx"]:
36
  try:
37
  import textract
38
- text = textract.process(file_path)
 
 
 
 
39
  return text.decode("utf-8").strip()
40
  except Exception as e:
41
  return f"Error reading Word file with textract: {str(e)}"
@@ -84,10 +86,8 @@ uploaded_file = st.file_uploader(
84
  )
85
 
86
  if uploaded_file:
87
- # Save uploaded file to a temp file in /tmp
88
- with tempfile.NamedTemporaryFile(delete=False, dir="/tmp", suffix=os.path.splitext(uploaded_file.name)[1]) as temp_file:
89
- temp_file.write(uploaded_file.read())
90
- temp_path = temp_file.name
91
 
92
  # Track upload session
93
  if (
@@ -98,11 +98,10 @@ if uploaded_file:
98
  st.session_state.serial_id = str(uuid.uuid4())
99
  st.session_state.corrected_prediction = None
100
 
101
- extracted_text = read_file(temp_path)
102
- os.remove(temp_path)
103
 
104
  if "Error" in extracted_text or not extracted_text.strip():
105
- st.warning("Could not extract text from the uploaded file.")
106
  else:
107
  cleaned_text = clean_resume(extracted_text)
108
  new_input = resume_vectorizer.transform([cleaned_text])
 
6
  import os
7
  import uuid
8
  from datetime import datetime
 
9
  import tempfile
10
+ from io import BytesIO
11
 
12
  # Load model and vectorizer
13
  classifier_model = joblib.load('resume_classifier')
14
  resume_vectorizer = joblib.load('resume_vectorizer')
15
 
16
 
17
+ def read_uploaded_file(uploaded_file):
18
+ ext = os.path.splitext(uploaded_file.name)[1].lower()
 
19
 
20
+ try:
21
  if ext == ".pdf":
22
+ reader = PyPDF2.PdfReader(uploaded_file)
23
+ text = ""
24
+ for page in reader.pages:
25
+ page_text = page.extract_text()
26
+ if page_text:
27
+ text += page_text + "\n"
28
+ return text.strip()
 
29
 
30
  elif ext == ".txt":
31
+ return uploaded_file.read().decode("utf-8").strip()
 
32
 
33
  elif ext in [".doc", ".docx"]:
34
  try:
35
  import textract
36
+ with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
37
+ tmp.write(uploaded_file.read())
38
+ tmp_path = tmp.name
39
+ text = textract.process(tmp_path)
40
+ os.remove(tmp_path)
41
  return text.decode("utf-8").strip()
42
  except Exception as e:
43
  return f"Error reading Word file with textract: {str(e)}"
 
86
  )
87
 
88
  if uploaded_file:
89
+ # Reset the file read pointer in case it was read earlier
90
+ uploaded_file.seek(0)
 
 
91
 
92
  # Track upload session
93
  if (
 
98
  st.session_state.serial_id = str(uuid.uuid4())
99
  st.session_state.corrected_prediction = None
100
 
101
+ extracted_text = read_uploaded_file(uploaded_file)
 
102
 
103
  if "Error" in extracted_text or not extracted_text.strip():
104
+ st.warning("⚠️ Could not extract text from the uploaded file.")
105
  else:
106
  cleaned_text = clean_resume(extracted_text)
107
  new_input = resume_vectorizer.transform([cleaned_text])