PJ2005 committed on
Commit
a51bdd6
·
verified ·
1 Parent(s): e3ef681

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -0
app.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, io, re, traceback
2
+ from PIL import Image, ImageFilter, ImageOps
3
+ import pytesseract
4
+ import docx
5
+ import PyPDF2
6
+ import gradio as gr
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+ import nltk
10
+ from nltk.corpus import stopwords
11
+
12
+ # ---------------- NLTK Stopwords ----------------
13
def _load_stopwords():
    """Return the NLTK English stop-word list as a set.

    If the corpus is not installed locally, NLTK raises ``LookupError``;
    in that case the corpus is downloaded once and the lookup is retried.
    """
    try:
        return set(stopwords.words("english"))
    except LookupError:
        nltk.download("stopwords")
        return set(stopwords.words("english"))


STOPWORDS = _load_stopwords()
18
+
19
+ # ---------------- Base Skills ----------------
20
# Built-in skill keywords (lower-case) that are always searched for,
# in addition to any custom skills supplied by the user.
BASE_SKILLS = [
    "python",
    "machine learning",
    "data analysis",
    "pandas",
    "numpy",
    "nlp",
    "deep learning",
    "tensorflow",
    "pytorch",
    "scikit-learn",
    "sql",
    "aws",
    "docker",
    "git",
    "rest api",
    "computer vision",
    "opencv",
    "transformers",
]
25
+
26
+ # ---------------- Optional PDF to Image ----------------
27
# pdf2image is optional: it is only needed to OCR scanned PDFs that have
# no text layer. Any failure while importing it simply disables that path.
try:
    from pdf2image import convert_from_bytes
except Exception:
    PDF2IMAGE_AVAILABLE = False
else:
    PDF2IMAGE_AVAILABLE = True
32
+
33
+ # ---------------- Extraction ----------------
34
# File-name suffixes that are sent straight to OCR.
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".bmp", ".tiff")


def _read_pdf_text(file_bytes):
    """Return the text layer of a PDF, each page followed by a space."""
    reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
    chunks = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            chunks.append(page_text + " ")
    return "".join(chunks)


def _ocr_image_bytes(file_bytes, denoise):
    """OCR an encoded image; optionally median-filter it first."""
    img = Image.open(io.BytesIO(file_bytes)).convert("RGB")
    img = ImageOps.grayscale(img)
    if denoise:
        img = img.filter(ImageFilter.MedianFilter())
    return pytesseract.image_to_string(img)


def extract_text_from_bytes(file_bytes, filename):
    """Extract plain text from an uploaded document.

    The handler is chosen from the (case-insensitive) file-name suffix:

    * ``.pdf``  - read the text layer; if that yields nothing and pdf2image
      is available, rasterise the pages and OCR them.
    * ``.docx`` / ``.doc`` - read paragraphs with python-docx; if that
      fails, fall back to decoding the bytes as UTF-8.
    * image suffixes - grayscale, median-filter and OCR.
    * ``.txt``  - decode as UTF-8, ignoring undecodable bytes.
    * anything else - try PDF, then OCR, then UTF-8 decoding, in that order.

    Args:
        file_bytes: Raw content of the file.
        filename: Original file name (may be ``None`` or empty).

    Returns:
        The extracted text with surrounding whitespace stripped, or ``""``
        if nothing could be extracted or an unexpected error occurred.
    """
    fname = (filename or "").lower()
    text = ""
    try:
        if fname.endswith(".pdf"):
            try:
                text = _read_pdf_text(file_bytes)
            except Exception:
                # Unreadable/corrupt PDF: drop partial output, maybe OCR below.
                text = ""
            if not text.strip() and PDF2IMAGE_AVAILABLE:
                for pg in convert_from_bytes(file_bytes, dpi=200):
                    pg = pg.convert("L").filter(ImageFilter.MedianFilter())
                    text += pytesseract.image_to_string(pg) + " "
        elif fname.endswith((".docx", ".doc")):
            try:
                doc = docx.Document(io.BytesIO(file_bytes))
                text = "\n".join(p.text for p in doc.paragraphs)
            except Exception:
                # Best effort when python-docx cannot open the file.
                text = file_bytes.decode("utf-8", errors="ignore")
        elif fname.endswith(_IMAGE_EXTENSIONS):
            text = _ocr_image_bytes(file_bytes, denoise=True)
        elif fname.endswith(".txt"):
            text = file_bytes.decode("utf-8", errors="ignore")
        else:
            # Unknown suffix: probe the formats from most to least structured.
            try:
                text = _read_pdf_text(file_bytes)
            except Exception:
                text = ""
            if not text.strip():
                try:
                    text = _ocr_image_bytes(file_bytes, denoise=False)
                except Exception:
                    try:
                        text = file_bytes.decode("utf-8", errors="ignore")
                    except Exception:
                        text = ""
    except Exception as e:
        # Boundary handler: the UI expects an empty string, not a crash.
        print("extract_text error:", e)
        return ""
    return text.strip()
88
+
89
+ # ---------------- Clean & Skills ----------------
90
def clean_text(text):
    """Normalise text for similarity scoring.

    Lower-cases the input, replaces every character other than ``a-z``,
    digits, whitespace, ``-``, ``.`` and ``@`` with a space, drops words
    found in ``STOPWORDS`` and joins the remaining words with single spaces.
    ``None`` is treated as an empty string.
    """
    lowered = (text or "").lower()
    scrubbed = re.sub(r"[^a-z0-9\s\-\.\@]", " ", lowered)
    return " ".join(word for word in scrubbed.split() if word not in STOPWORDS)
95
+
96
def find_skills(text, custom_skills=None):
    """Return the sorted, de-duplicated skills mentioned in ``text``.

    Args:
        text: Text to search; ``None`` is treated as empty.
        custom_skills: Optional iterable of extra skill names. Entries are
            stripped and lower-cased; blank entries are ignored.

    Returns:
        A sorted list of lower-case skill names (from ``BASE_SKILLS`` plus
        ``custom_skills``) that occur in ``text``.

    A skill only counts when it is not embedded in a longer alphanumeric
    run, so "git" is not reported for "digital" nor "aws" for "laws".
    """
    extra = [s.strip().lower() for s in (custom_skills or []) if s and s.strip()]
    haystack = (text or "").lower()
    found = set()
    for skill in BASE_SKILLS + extra:
        # Lookarounds instead of \b so skills ending in symbols ("c++") work.
        pattern = r"(?<![a-z0-9])" + re.escape(skill) + r"(?![a-z0-9])"
        if re.search(pattern, haystack):
            found.add(skill)
    return sorted(found)
101
+
102
def compute_similarity(resume_text, job_text):
    """Return the TF-IDF cosine similarity of two texts as a percentage.

    Args:
        resume_text: Text of the resume (``None`` is treated as empty).
        job_text: Text of the job description (``None`` is treated as empty).

    Returns:
        A float in the range 0-100. ``0.0`` is returned when either text is
        empty/whitespace-only or when vectorisation fails.
    """
    resume_text = resume_text or ""
    job_text = job_text or ""
    if not job_text.strip() or not resume_text.strip():
        return 0.0
    try:
        vec = TfidfVectorizer().fit_transform([resume_text, job_text])
        sim = cosine_similarity(vec[0:1], vec[1:2])[0][0]
        return float(sim * 100)
    except Exception as e:
        # Best effort: a scoring failure must not break the whole analysis.
        print("compute_similarity error:", e)
        return 0.0
113
+
114
+ # ---------------- Main Function ----------------
115
def _read_upload(file):
    """Normalise an upload into ``(file_bytes, filename)``.

    Accepts a filesystem path, a dict carrying ``name``/``filename`` and
    ``data``/``tmp_path`` entries, or any object with a ``read()`` method.
    Returns ``None`` for every other type.
    """
    if isinstance(file, str):
        with open(file, "rb") as f:
            return f.read(), os.path.basename(file)

    if isinstance(file, dict):
        filename = file.get("name") or file.get("filename") or "uploaded_file"
        data = file.get("data") or file.get("tmp_path")
        if isinstance(data, str) and os.path.exists(data):
            with open(data, "rb") as f:
                return f.read(), filename
        if isinstance(data, (bytes, bytearray)):
            return data, filename
        return b"", filename

    if hasattr(file, "read"):
        filename = getattr(file, "name", "uploaded_file")
        if not isinstance(filename, str):
            # e.g. objects opened from a file descriptor expose an int name.
            filename = str(filename)
        payload = file.read()
        if isinstance(payload, str):
            # Text-mode handles yield str; extraction works on bytes.
            payload = payload.encode("utf-8")
        return payload, filename

    return None


def analyze(file, job_description, custom_input):
    """Analyse an uploaded resume against an optional job description.

    Args:
        file: The upload - a path string, a dict describing the upload, or
            a readable file-like object.
        job_description: Job description text; may be empty or ``None``.
        custom_input: Comma-separated extra skills; may be empty or ``None``.

    Returns:
        A 5-tuple ``(preview, cleaned_text, skills, score, suggestions)``:
        the first 2000 characters of the extracted text, the cleaned text,
        the detected skills joined by ", ", the similarity score rounded to
        two decimals, and a short summary. On failure the first element is
        an error message and the last element carries a hint or the
        exception text.
    """
    try:
        if not file:
            return "No file uploaded", "", "", 0.0, "Upload a file (PNG/JPG/PDF/DOCX/TXT)"

        upload = _read_upload(file)
        if upload is None:
            return "Unsupported file object", "", "", 0.0, "Unsupported file object type"
        file_bytes, filename = upload

        text = extract_text_from_bytes(file_bytes, filename)
        if not text:
            return "Could not extract text from file", "", "", 0.0, "Try a clearer image or a different file type"

        cleaned_resume = clean_text(text)
        cleaned_job = clean_text(job_description or "")

        custom_skills = [s.strip() for s in (custom_input or "").split(",") if s.strip()]
        skills_found = find_skills(text, custom_skills)

        # Without a job description there is nothing to compare against.
        score = compute_similarity(cleaned_resume, cleaned_job) if cleaned_job else 0.0

        suggestions = f"Skills found: {', '.join(skills_found) if skills_found else 'None'}\nSimilarity score: {score:.2f}%"
        short_preview = text[:2000] + ("..." if len(text) > 2000 else "")

        return short_preview, cleaned_resume, ", ".join(skills_found), round(score, 2), suggestions

    except Exception as e:
        # UI boundary: report the failure in the output fields instead of raising.
        traceback.print_exc()
        return "Error during analysis", "", "", 0.0, str(e)
161
+
162
+ # ---------------- Gradio UI ----------------
163
# Two-column layout: inputs and the trigger button on the left,
# the five result fields on the right.
with gr.Blocks() as demo:
    gr.Markdown("# ⚡ AI Resume Analyzer")
    with gr.Row():
        with gr.Column(scale=2):
            file_input = gr.File(
                label="Upload Resume (PNG/JPG/PDF/DOCX/TXT)",
                file_count="single",
                type="filepath",
            )
            job_input = gr.Textbox(
                lines=4,
                label="Paste Job Description (optional)",
            )
            custom_skills = gr.Textbox(
                lines=2,
                label="Custom Skills (comma separated, optional)",
            )
            run_btn = gr.Button("Analyze Resume")
        with gr.Column(scale=3):
            output_preview = gr.Textbox(label="Extracted Text Preview")
            output_clean = gr.Textbox(label="Cleaned Text")
            output_skills = gr.Textbox(label="Detected Skills")
            output_score = gr.Number(label="Match Score (%)")
            output_suggest = gr.Textbox(label="Suggestions")

    # The output order must match the 5-tuple returned by analyze().
    analysis_inputs = [file_input, job_input, custom_skills]
    analysis_outputs = [
        output_preview,
        output_clean,
        output_skills,
        output_score,
        output_suggest,
    ]
    run_btn.click(fn=analyze, inputs=analysis_inputs, outputs=analysis_outputs)

if __name__ == "__main__":
    demo.launch()