mkoot007 committed on
Commit
73f7377
·
1 Parent(s): 80de488

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -60
app.py CHANGED
@@ -1,63 +1,11 @@
1
- import gradio as gr
2
- import re
3
- from docx import Document
4
- from PyPDF2 import PdfReader # Use PdfReader from PyPDF2
5
 
6
- # Function to extract text from a PDF file
7
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    This is the single definition of the function; the file previously
    defined it twice, and the later definition (the one with the empty-file
    check) silently replaced the earlier one.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source argument.

    Returns:
        The text of all pages joined together with no separator.

    Raises:
        ValueError: If the PDF contains no pages.
    """
    pdf = PdfReader(pdf_file)

    if not pdf.pages:
        raise ValueError("The PDF file is empty.")

    # Join once instead of growing a string with ``+=``; ``or ""`` keeps the
    # join safe should a page yield no text at all.
    return "".join(page.extract_text() or "" for page in pdf.pages)


# Function to extract text from a DOCX file
def extract_text_from_docx(docx_file):
    """Return the text of a DOCX document, one paragraph per line.

    Args:
        docx_file: Whatever ``Document`` accepts as its source argument.

    Returns:
        The text of every paragraph in ``doc.paragraphs`` joined by newlines.
    """
    doc = Document(docx_file)
    return "\n".join(para.text for para in doc.paragraphs)
29
- # Function to extract information from a resume
30
def extract_info_from_resume(resume_path):
    """Extract a name, an email address and a phone number from a resume.

    Args:
        resume_path: The uploaded resume. Either an object that exposes the
            file name as ``.name`` or a plain path string. The name must end
            in ``.pdf`` or ``.docx`` (checked case-insensitively).

    Returns:
        A dict with the keys ``"Name"``, ``"Email"`` and ``"Phone"``. Each
        value is the first regex match found in the document text, or a
        "... not found" placeholder when nothing matched.

    Raises:
        ValueError: If the file name has an unsupported extension.
    """
    # Accept both upload objects (name in ``.name``) and bare path strings,
    # and compare lower-cased so "CV.PDF" is treated like "cv.pdf".
    filename = str(getattr(resume_path, "name", resume_path)).lower()

    if filename.endswith(".pdf"):
        text = extract_text_from_pdf(resume_path)
    elif filename.endswith(".docx"):
        text = extract_text_from_docx(resume_path)
    else:
        raise ValueError("Unsupported file format. Only PDF and DOCX are supported.")

    # Output key -> (regular expression, placeholder used when nothing matches).
    # The name pattern is a heuristic: the first run of two or more
    # capitalised words anywhere in the text.
    field_patterns = {
        "Name": (r"([A-Z][a-z]+(?: [A-Z][a-z]+)+)", "Name not found"),
        "Email": (r"[\w\.-]+@[\w\.-]+", "Email not found"),
        "Phone": (r"(\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})", "Phone number not found"),
    }

    extracted_info = {}
    for field, (pattern, placeholder) in field_patterns.items():
        match = re.search(pattern, text)
        extracted_info[field] = match.group() if match else placeholder

    return extracted_info
54
-
55
- # Define a Gradio interface
56
# Gradio UI: upload a resume file and show the extracted fields as JSON.
iface = gr.Interface(
    fn=extract_info_from_resume,
    # Use the top-level component; the ``gr.inputs`` namespace is deprecated.
    # NOTE(review): ``type="file"`` is kept so the callback still receives an
    # object with ``.name`` - confirm against the installed Gradio version.
    inputs=gr.File(type="file"),
    outputs="json",
)

# Deploy the Gradio interface
iface.launch(share=True)
 
1
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
2
+ from transformers import pipeline
 
 
3
 
4
# Checkpoint identifier shared by the tokenizer and the model.
MODEL_NAME = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# "ner" pipeline built from the tokenizer and model loaded above.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Run the pipeline on one sample sentence and print whatever it returns.
example = "My name is Wolfgang and I live in Berlin"
ner_results = nlp(example)
print(ner_results)