nafees369 committed on
Commit
3cf6a99
·
verified ·
1 Parent(s): 8447410

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -76
app.py CHANGED
@@ -1,105 +1,85 @@
1
 
2
  import gradio as gr
3
- from transformers import AutoTokenizer, AutoModelForTokenClassification
4
- import torch
5
- import fitz # PyMuPDF
6
 
7
- # Load the NER model and tokenizer
8
- model_name = "Ioana23/bert-finetuned-resumes-ner"
9
- model = AutoModelForTokenClassification.from_pretrained(model_name, from_tf=True)
10
  tokenizer = AutoTokenizer.from_pretrained(model_name)
 
11
 
12
- # Function to extract text from PDF
13
- def extract_text_from_pdf(file):
14
- doc = fitz.open(file)
15
  text = ""
16
  for page in doc:
17
  text += page.get_text()
18
  return text.strip()
19
 
20
- # Function to map common entity labels to custom labels
21
- def map_labels(label, label_map):
22
- for key, value in label_map.items():
23
- if label in value:
24
- return key
25
- return label
26
 
27
- # Define the function to process the input text and labels
28
  def process_text(file, labels):
29
  # Extract text from the PDF file
30
  text = extract_text_from_pdf(file.name)
31
-
32
- # Tokenize the text
33
- inputs = tokenizer(text.split(), return_tensors="pt", is_split_into_words=True, truncation=True, padding="max_length", max_length=512)
34
-
35
- # Make predictions
36
- with torch.no_grad():
37
- outputs = model(**inputs)
38
- predictions = torch.argmax(outputs.logits, dim=-1)
39
-
40
- # Custom label mapping (enhanced prediction)
41
  label_map = {
42
- "Name": ["B-PER", "I-PER"],
43
- "Organization": ["B-ORG", "I-ORG"],
44
- "Location": ["B-LOC", "I-LOC"],
45
- "Project": ["B-PROJECT", "I-PROJECT"],
46
- "Education": ["B-EDUCATION", "I-EDUCATION"],
47
  }
48
-
49
- # Prepare a dictionary to hold extracted information for each label
50
- extracted_info = {label.strip(): [] for label in labels.split(",")}
51
-
52
- current_word = ""
53
- last_label = None
54
-
55
- for i, pred in enumerate(predictions[0]):
56
- entity_label = model.config.id2label[pred.item()]
57
- word_piece = tokenizer.decode(inputs.input_ids[0][i]).strip()
58
-
59
- # Map entity labels to the custom labels provided by the user
60
- mapped_label = map_labels(entity_label, label_map)
61
-
62
- if word_piece.startswith("##"):
63
- current_word += word_piece[2:] # Append subword without ##
64
- else:
65
- if current_word and last_label in extracted_info:
66
- extracted_info[last_label].append(current_word)
67
- current_word = word_piece # Start new word
68
- last_label = mapped_label if mapped_label in extracted_info else None
69
-
70
- # If the current word is complete and matches the label, append it
71
- if last_label and mapped_label in extracted_info:
72
- extracted_info[mapped_label].append(current_word)
73
- current_word = "" # Reset current word after adding
74
-
75
- # Add the last word if it's valid
76
- if current_word and last_label in extracted_info:
77
- extracted_info[last_label].append(current_word)
78
-
79
- # Prepare the final output
80
  output = ""
81
- for label, words in extracted_info.items():
82
- if words:
83
- # Clean and join the words with a single space and remove extra spaces
84
- cleaned_words = ' '.join(words).replace(" ", " ") # Ensures correct spacing
85
- output += f"{label}: {cleaned_words}\n"
86
  else:
87
- output += f"{label}: No information found for this label.\n"
88
-
89
  return output.strip()
90
 
91
  # Create Gradio components
92
- file_input = gr.File(label="Upload a file")
93
- label_input = gr.Textbox(label="Enter labels (comma-separated)")
94
- output_text = gr.Textbox(label="Extracted information")
95
 
96
  # Create the Gradio interface
97
  iface = gr.Interface(
98
  fn=process_text,
99
  inputs=[file_input, label_input],
100
  outputs=output_text,
101
- title="NER with Custom Labels"
 
102
  )
103
 
104
- # Launch the interface
105
- iface.launch()
 
1
 
2
  import gradio as gr
3
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
4
+ import fitz # PyMuPDF for PDF handling
 
5
 
6
# Load a pre-trained NER model fine-tuned for English token classification.
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# "simple" aggregation merges word-piece tokens into whole-word entity groups,
# so downstream code sees one entry per entity span (keys: 'word', 'entity_group').
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
11
 
12
# Function to extract text from a PDF file
def extract_text_from_pdf(file_path):
    """Return the concatenated plain text of every page in the PDF at *file_path*.

    Fix: the original opened the document with ``fitz.open`` and never closed
    it, leaking a file handle per upload. Opening the document as a context
    manager guarantees it is released even if text extraction raises.
    """
    with fitz.open(file_path) as doc:
        # page.get_text() yields the plain text of a single page.
        return "".join(page.get_text() for page in doc).strip()
19
 
20
# Function to map recognized entities to custom labels
def map_labels(entity_label, label_map):
    """Translate a raw NER tag into one of the user-facing custom labels.

    Returns the first custom label (in *label_map* insertion order) whose
    tag list contains *entity_label*, or ``None`` when nothing matches.
    """
    matches = (custom for custom, tags in label_map.items() if entity_label in tags)
    return next(matches, None)
26
 
27
# Function to process the text and extract entities based on custom labels
def process_text(file, labels):
    """Extract named entities from an uploaded PDF and group them under
    the user's custom labels.

    Parameters
    ----------
    file :
        Uploaded-file object with a ``.name`` path attribute (Gradio ``File``).
    labels : str
        Comma-separated custom label names, e.g. ``"Name, Organization"``.

    Returns a human-readable report with one line per requested label.

    Fix: blank entries in the label list (a trailing comma such as
    ``"Name,"``, or an empty textbox) previously became empty-string labels
    and produced meaningless ": No information found." lines; they are now
    filtered out, and an all-blank input yields an explanatory message.
    """
    # Extract text from the PDF file
    text = extract_text_from_pdf(file.name)

    # Map user-facing labels to the model's entity groups.
    # NOTE(review): "Project" and "Education" both map to MISC, so they will
    # always receive identical entity lists — confirm this is intended.
    label_map = {
        "Name": ["PER"],
        "Organization": ["ORG"],
        "Location": ["LOC"],
        "Project": ["MISC"],
        "Education": ["MISC"],
    }

    # Split the custom labels provided by the user, ignoring blank entries.
    requested_labels = [label.strip() for label in labels.split(",") if label.strip()]
    if not requested_labels:
        return "No labels provided. Enter one or more comma-separated labels."

    # Perform NER on the extracted text.
    ner_results = ner_pipeline(text)

    # One bucket of matched entity strings per requested label.
    extracted_info = {label: [] for label in requested_labels}

    for entity in ner_results:
        # Strip any residual word-piece markers and translate the model's
        # tag (PER/ORG/LOC/MISC) into the user's custom label.
        entity_text = entity['word'].replace("##", "")
        mapped_label = map_labels(entity['entity_group'], label_map)

        # Keep only entities whose mapped label was actually requested.
        if mapped_label in extracted_info:
            extracted_info[mapped_label].append(entity_text)

    # Format the output: deduplicate and sort for stable, readable results.
    output = ""
    for label, entities in extracted_info.items():
        if entities:
            output += f"{label}: {', '.join(sorted(set(entities)))}\n"
        else:
            output += f"{label}: No information found.\n"

    return output.strip()
69
 
70
# Build the Gradio widgets for the interface.
file_input = gr.File(label="Upload a PDF file")
label_input = gr.Textbox(label="Enter labels to extract (comma-separated)")
output_text = gr.Textbox(label="Extracted Information")

# Wire inputs and output into a single Interface around process_text.
iface = gr.Interface(
    fn=process_text,
    inputs=[file_input, label_input],
    outputs=output_text,
    title="NER with Custom Labels from PDF",
    description="Upload a PDF file and extract entities based on custom labels.",
)

# Start the web app.
iface.launch()