bangaboy committed on
Commit
ea88d49
·
verified ·
1 Parent(s): 4580e9c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +221 -0
app.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !pip install streamlit google-generativeai pymupdf pyngrok transformers spacy python-docx nltk dateparser
2
+ !python -m spacy download en_core_web_sm
3
+ !python -m nltk.downloader words
4
+
5
+ %%writefile combined_resume_analyzer.py
6
+
7
+
8
+
9
+ import google.generativeai as genai
10
+ import fitz # PyMuPDF for PDF text extraction
11
+ import streamlit as st
12
+ import spacy
13
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
14
+ from docx import Document
15
+ import re
16
+ from nltk.corpus import words
17
+ import dateparser
18
+ from datetime import datetime
19
+ from pyngrok import ngrok
20
+ import os
21
+
22
# Streamlit re-executes this script on every user interaction, so the heavy
# model loads are wrapped in st.cache_resource and performed once per process.


@st.cache_resource
def _load_spacy_model():
    """Load the SpaCy English pipeline whose entities the resume parsers read."""
    return spacy.load('en_core_web_sm')


@st.cache_resource
def _load_ner_components():
    """Load the NER tokenizer, model and aggregated token-classification pipeline."""
    ner_tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
    ner_model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
    ner_pipeline = pipeline('ner', model=ner_model, tokenizer=ner_tokenizer, aggregation_strategy="simple")
    return ner_tokenizer, ner_model, ner_pipeline


@st.cache_resource
def _load_english_words():
    """Load the NLTK English word list as a set."""
    return set(words.words())


# Module-level names are kept so the rest of the file can keep using them.
nlp_spacy = _load_spacy_model()
tokenizer, model, nlp_ner = _load_ner_components()
english_words = _load_english_words()
31
+
32
+ # Function to authenticate with Gemini API
33
def authenticate_gemini(api_key):
    """Configure the Gemini client and return a generative model handle.

    The outcome is reported in the Streamlit UI through ``st.success`` or
    ``st.error``.

    Args:
        api_key: Google Gemini API key as entered by the user.

    Returns:
        A ``genai.GenerativeModel`` for "gemini-1.5-flash-latest", or ``None``
        when the key is blank or configuring the SDK raises.
    """
    # A blank key can never authenticate; fail early with a clear message.
    if not api_key or not api_key.strip():
        st.error("Error configuring Gemini API: API key is empty.")
        return None

    try:
        # Pasted keys often carry stray whitespace; strip it before use.
        genai.configure(api_key=api_key.strip())
        # Named gemini_model so it does not shadow the module-level NER `model`.
        gemini_model = genai.GenerativeModel(model_name="gemini-1.5-flash-latest")
    except Exception as e:  # UI boundary: surface any SDK failure to the user
        st.error(f"Error configuring Gemini API: {e}")
        return None

    st.success("Gemini API successfully configured.")
    return gemini_model
42
+
43
+ # Function to filter and refine extracted ORG entities
44
def refine_org_entities(entities):
    """Keep only the entity strings that look like company names.

    An entity is kept when it ends with a known company suffix (a trailing
    '.', ',', ';' or ':' is ignored for this check, so "IBM Corp." counts), or
    when it starts with two capitalized words separated by whitespace.

    Args:
        entities: Iterable of candidate organisation names.

    Returns:
        A sorted list of the unique entities that passed the filter.
    """
    company_suffixes = ('Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.')
    capitalized_pair = re.compile(r'([A-Z][a-z]+)\s([A-Z][a-z]+)')

    refined_entities = set()
    for entity in entities:
        # Compare without trailing punctuation so "Acme Corp." is treated
        # like "Acme Corp"; the raw form is checked too because "S.A." ends
        # with a period itself.
        unpunctuated = entity.rstrip('.,;:')
        if (
            entity.endswith(company_suffixes)
            or unpunctuated.endswith(company_suffixes)
            or capitalized_pair.match(entity)
        ):
            refined_entities.add(entity)

    # Sorted so the UI shows the same order on every run.
    return sorted(refined_entities)
54
+
55
+ # Function to extract ORG entities using NER
56
def extract_orgs(text):
    """Extract organisation names from *text* with the NER pipeline.

    Args:
        text: Plain text of the resume.

    Returns:
        The list produced by ``refine_org_entities`` for the unique 'ORG'
        entities found; an empty list when *text* is empty or whitespace.
    """
    # Nothing to tag; avoid running the transformer on blank input.
    if not text or not text.strip():
        return []

    # NOTE(review): long resumes are presumably truncated to the model's
    # maximum sequence length by the pipeline - confirm and chunk if needed.
    orgs = set()
    for entity in nlp_ner(text):
        if entity['entity_group'] != 'ORG':
            continue
        word = entity['word'].strip()
        # Skip empty strings and stray WordPiece continuation fragments ("##...").
        if word and not word.startswith('##'):
            orgs.add(word)

    return refine_org_entities(orgs)
64
+
65
+ # Extract text from PDF
66
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Binary file-like object (anything with ``read()``)
            containing the PDF bytes.

    Returns:
        The text of all pages joined in page order.
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
73
+
74
+ # Extract text from DOCX
75
def extract_text_from_doc(doc_file):
    """Return the text of a DOCX document, including text inside tables.

    Body paragraphs come first (one per line), followed by one line per
    non-empty table row with its cell texts joined by spaces.

    Args:
        doc_file: Path or file-like object accepted by ``Document``.

    Returns:
        The extracted text with lines separated by newlines.
    """
    doc = Document(doc_file)
    lines = [para.text for para in doc.paragraphs]

    # Resumes are frequently laid out with tables, whose text is not part of
    # doc.paragraphs.
    for table in doc.tables:
        for row in table.rows:
            cell_texts = []
            for cell in row.cells:
                cell_text = cell.text.strip()
                # A merged cell is reported once per spanned column; keep a
                # single copy of consecutive repeats.
                if cell_text and (not cell_texts or cell_texts[-1] != cell_text):
                    cell_texts.append(cell_text)
            if cell_texts:
                lines.append(' '.join(cell_texts))

    return '\n'.join(lines)
79
+
80
+ # Summary generation function
81
def generate_summary(text, model):
    """Ask the generative model for a 100-word summary of *text*.

    Args:
        text: Document text to summarize.
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``text`` attribute.

    Returns:
        The summary text, or a string starting with
        "Error generating summary:" when the document is blank or the model
        call raises.
    """
    # Do not spend an API call on a document with nothing to summarize.
    if not text or not text.strip():
        return "Error generating summary: document is empty"

    prompt = f"Can you summarize the following document in 100 words?\n\n{text}"
    try:
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:  # best effort: the caller displays the message
        return f"Error generating summary: {e}"
88
+
89
+ # Additional resume parsing functions
90
def extract_experience(doc):
    """Estimate years of experience from the DATE entities of *doc*.

    Each DATE entity is parsed with ``dateparser``; the result is the largest
    difference between the current year and a parsed year, never below 0.

    Args:
        doc: Object whose ``ents`` yield items with ``label_`` and ``text``.

    Returns:
        The estimated number of years as an int (0 when no date qualifies).
    """
    years = 0
    date_entities = (ent for ent in doc.ents if ent.label_ == "DATE")
    for ent in date_entities:
        parsed = dateparser.parse(ent.text)
        if not parsed:
            # Entity text could not be interpreted as a date
            continue
        elapsed = datetime.now().year - parsed.year
        if elapsed > years:
            years = elapsed
    return years
98
+
99
def extract_phone(text):
    """Return the first North-American style phone number found in *text*.

    Accepts an optional "+1"/"1" country prefix, an area code with or
    without parentheses, and '-', '.' or whitespace as separators.

    Args:
        text: Text to search.

    Returns:
        The matched phone number, or "Not found".
    """
    # Lookarounds replace \b: a word boundary can never sit in front of "("
    # or "+", which made "(555) 123-4567" unmatchable and dropped the "+".
    phone_pattern = (
        r'(?<!\w)(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})'
        r'[-.\s]?\d{3}[-.\s]?\d{4}(?!\w)'
    )
    match = re.search(phone_pattern, text)
    return match.group() if match else "Not found"
109
+
110
def extract_email(text):
    """Return the first e-mail address found in *text*.

    Args:
        text: Text to search.

    Returns:
        The matched address, or "Not found".
    """
    # The TLD class is [A-Za-z]; a "|" inside a character class is a literal
    # pipe, not alternation, so it must not appear there.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
114
+
115
def extract_colleges(doc):
    """Collect ORG entities whose text mentions an educational institution.

    Args:
        doc: Object whose ``ents`` yield items with ``label_`` and ``text``.

    Returns:
        A sorted list of unique institution names, with runs of whitespace
        (including line breaks) collapsed to single spaces.
    """
    edu_keywords = ("university", "college", "institute", "school")

    colleges = set()
    for ent in doc.ents:
        if ent.label_ != "ORG":
            continue
        # Names that wrap across lines would otherwise keep the line break
        # and show up as distinct entries.
        name = " ".join(ent.text.split())
        lowered = name.lower()
        if any(keyword in lowered for keyword in edu_keywords):
            colleges.add(name)

    # Sorted so the UI shows the same order on every run.
    return sorted(colleges)
122
+
123
def extract_linkedin(text):
    """Return the first LinkedIn profile URL found in *text*.

    The scheme ("http://", "https://" or "//") and a subdomain such as
    "www." are optional, and matching is case-insensitive.

    Args:
        text: Text to search.

    Returns:
        The matched profile URL, or "Not found".
    """
    # [A-Za-z] instead of [A-z]: the A-z range also covers "[", "\", "]",
    # "^", "_" and "`", which let trailing brackets leak into the match.
    linkedin_pattern = r'(?:(?:https?:)?//)?(?:\w+\.)?linkedin\.com/in/[A-Za-z0-9_-]+/?'
    match = re.search(linkedin_pattern, text, re.IGNORECASE)
    return match.group() if match else "Not found"
127
+
128
+ # Main function to process the resume and return the analysis
129
def main():
    """Render the Streamlit page: collect inputs, analyze the resume, show results.

    Nothing is processed until both an API key and a file are provided. Any
    exception raised while processing is reported with ``st.error``.
    """
    st.title("Comprehensive Resume Analyzer")
    st.write("Upload a resume to extract information, generate a summary, and analyze details.")

    # Input for API key
    api_key = st.text_input("Enter your Google Gemini API key", type="password")

    # Legacy ".doc" is not offered: DOCX parsing goes through python-docx's
    # Document, which reads the .docx format only.
    uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx"])

    if uploaded_file is None or not api_key:
        return

    try:
        # Named gemini_model so it does not shadow the module-level NER `model`.
        gemini_model = authenticate_gemini(api_key)
        if gemini_model is None:
            return

        # Extract text from the uploaded resume
        file_ext = uploaded_file.name.split('.')[-1].lower()
        if file_ext == 'pdf':
            resume_text = extract_text_from_pdf(uploaded_file)
        elif file_ext == 'docx':
            resume_text = extract_text_from_doc(uploaded_file)
        else:
            st.error("Unsupported file format.")
            return

        if not resume_text.strip():
            st.error("The resume appears to be empty.")
            return

        # Process the resume
        doc = nlp_spacy(resume_text)

        # Extract information
        companies = extract_orgs(resume_text)
        summary = generate_summary(resume_text, gemini_model)
        experience = extract_experience(doc)
        phone = extract_phone(resume_text)
        email = extract_email(resume_text)
        colleges = extract_colleges(doc)
        linkedin = extract_linkedin(resume_text)

        # Display results; empty lists show "Not found" like the scalar fields
        st.subheader("Extracted Information")
        st.write(f"*Years of Experience:* {experience}")
        st.write("*Companies Worked For:*")
        st.write(", ".join(companies) if companies else "Not found")
        st.write(f"*Phone Number:* {phone}")
        st.write(f"*Email ID:* {email}")
        st.write("*Colleges Attended:*")
        st.write(", ".join(colleges) if colleges else "Not found")
        st.write(f"*LinkedIn ID:* {linkedin}")

        st.subheader("Generated Summary")
        st.write(summary)

    except Exception as e:  # UI boundary: show the failure instead of a stack trace
        st.error(f"Error during processing: {e}")
188
+
189
+ if __name__ == "__main__":
190
+ main()from pyngrok import ngrok
191
+
192
+ # Set your authtoken
193
+ ngrok.set_auth_token("2keP9BS91BCtRFtnf5Ss4tOpzq4_2c6463MYzXPqFM3a95gUM") # Replace YOUR_AUTHTOKEN
194
+
195
+ # Terminate any running ngrok processes (if any)
196
+ !pkill -f streamlit
197
+
198
+ # Run Streamlit in the background
199
+ # The 'port' option should be passed as a keyword argument to the 'ngrok.connect()' function.
200
+ public_url = ngrok.connect(8501)
201
+ print("Public URL:", public_url)
202
+
203
+ # Launch Streamlit
204
+ !streamlit run combined_resume_analyzer.py
205
+
206
+
207
+ from pyngrok import ngrok
208
+
209
+ # Set your authtoken
210
+ ngrok.set_auth_token("2keP9BS91BCtRFtnf5Ss4tOpzq4_2c6463MYzXPqFM3a95gUM") # Replace YOUR_AUTHTOKEN
211
+
212
+ # Terminate any running ngrok processes (if any)
213
+ !pkill -f streamlit
214
+
215
+ # Run Streamlit in the background
216
+ # The 'port' option should be passed as a keyword argument to the 'ngrok.connect()' function.
217
+ public_url = ngrok.connect(8501)
218
+ print("Public URL:", public_url)
219
+
220
+ # Launch Streamlit
221
+ !streamlit run combined_resume_analyzer.py