Spaces:

nafees369
/

NER

Sleeping

App Files Community

nafees369 committed on Sep 1, 2024

Commit

3cf6a99

verified ·

1 Parent(s): 8447410

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -76

app.py CHANGED Viewed

@@ -1,105 +1,85 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForTokenClassification
-import torch
-import fitz  # PyMuPDF
-# Load the NER model and tokenizer
-model_name = "Ioana23/bert-finetuned-resumes-ner"
-model = AutoModelForTokenClassification.from_pretrained(model_name, from_tf=True)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-# Function to extract text from PDF
-def extract_text_from_pdf(file):
-    doc = fitz.open(file)
     text = ""
     for page in doc:
         text += page.get_text()
     return text.strip()
-# Function to map common entity labels to custom labels
-def map_labels(label, label_map):
-    for key, value in label_map.items():
-        if label in value:
-            return key
-    return label
-# Define the function to process the input text and labels
 def process_text(file, labels):
     # Extract text from the PDF file
     text = extract_text_from_pdf(file.name)
-    # Tokenize the text
-    inputs = tokenizer(text.split(), return_tensors="pt", is_split_into_words=True, truncation=True, padding="max_length", max_length=512)
-    # Make predictions
-    with torch.no_grad():
-        outputs = model(**inputs)
-        predictions = torch.argmax(outputs.logits, dim=-1)
-    # Custom label mapping (enhanced prediction)
     label_map = {
-        "Name": ["B-PER", "I-PER"],
-        "Organization": ["B-ORG", "I-ORG"],
-        "Location": ["B-LOC", "I-LOC"],
-        "Project": ["B-PROJECT", "I-PROJECT"],
-        "Education": ["B-EDUCATION", "I-EDUCATION"],
     }
-    # Prepare a dictionary to hold extracted information for each label
-    extracted_info = {label.strip(): [] for label in labels.split(",")}
-    current_word = ""
-    last_label = None
-    for i, pred in enumerate(predictions[0]):
-        entity_label = model.config.id2label[pred.item()]
-        word_piece = tokenizer.decode(inputs.input_ids[0][i]).strip()
-        # Map entity labels to the custom labels provided by the user
-        mapped_label = map_labels(entity_label, label_map)
-        if word_piece.startswith("##"):
-            current_word += word_piece[2:]  # Append subword without ##
-        else:
-            if current_word and last_label in extracted_info:
-                extracted_info[last_label].append(current_word)
-            current_word = word_piece  # Start new word
-            last_label = mapped_label if mapped_label in extracted_info else None
-        # If the current word is complete and matches the label, append it
-        if last_label and mapped_label in extracted_info:
-            extracted_info[mapped_label].append(current_word)
-            current_word = ""  # Reset current word after adding
-    # Add the last word if it's valid
-    if current_word and last_label in extracted_info:
-        extracted_info[last_label].append(current_word)
-    # Prepare the final output
     output = ""
-    for label, words in extracted_info.items():
-        if words:
-            # Clean and join the words with a single space and remove extra spaces
-            cleaned_words = ' '.join(words).replace("  ", " ")  # Ensures correct spacing
-            output += f"{label}: {cleaned_words}\n"
         else:
-            output += f"{label}: No information found for this label.\n"
     return output.strip()
 # Create Gradio components
-file_input = gr.File(label="Upload a file")
-label_input = gr.Textbox(label="Enter labels (comma-separated)")
-output_text = gr.Textbox(label="Extracted information")
 # Create the Gradio interface
 iface = gr.Interface(
     fn=process_text,
     inputs=[file_input, label_input],
     outputs=output_text,
-    title="NER with Custom Labels"
 )
-# Launch the interface
-iface.launch()

 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+import fitz  # PyMuPDF for PDF handling
+# Load a pre-trained NER model
+model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
+model = AutoModelForTokenClassification.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
+# Function to extract text from a PDF file
+def extract_text_from_pdf(file_path):
+    doc = fitz.open(file_path)
     text = ""
     for page in doc:
         text += page.get_text()
     return text.strip()
+# Function to map recognized entities to custom labels
+def map_labels(entity_label, label_map):
+    for custom_label, ner_labels in label_map.items():
+        if entity_label in ner_labels:
+            return custom_label
+    return None
+# Function to process the text and extract entities based on custom labels
 def process_text(file, labels):
     # Extract text from the PDF file
     text = extract_text_from_pdf(file.name)
+    # Define the custom label mapping
     label_map = {
+        "Name": ["PER"],
+        "Organization": ["ORG"],
+        "Location": ["LOC"],
+        "Project": ["MISC"],
+        "Education": ["MISC"],
     }
+    # Split the custom labels provided by the user
+    requested_labels = [label.strip() for label in labels.split(",")]
+    # Perform NER on the extracted text
+    ner_results = ner_pipeline(text)
+    # Initialize a dictionary to hold the extracted information
+    extracted_info = {label: [] for label in requested_labels}
+    # Process the NER results
+    for entity in ner_results:
+        # Remove subword tokens (##) and map the entity to the custom labels
+        entity_text = entity['word'].replace("##", "")
+        mapped_label = map_labels(entity['entity_group'], label_map)
+        # If the mapped label is in the requested labels, store the entity
+        if mapped_label in extracted_info:
+            extracted_info[mapped_label].append(entity_text)
+    # Format the output
     output = ""
+    for label, entities in extracted_info.items():
+        if entities:
+            output += f"{label}: {', '.join(sorted(set(entities)))}\n"
         else:
+            output += f"{label}: No information found.\n"
     return output.strip()
 # Create Gradio components
+file_input = gr.File(label="Upload a PDF file")
+label_input = gr.Textbox(label="Enter labels to extract (comma-separated)")
+output_text = gr.Textbox(label="Extracted Information")
 # Create the Gradio interface
 iface = gr.Interface(
     fn=process_text,
     inputs=[file_input, label_input],
     outputs=output_text,
+    title="NER with Custom Labels from PDF",
+    description="Upload a PDF file and extract entities based on custom labels."
 )
+# Launch the Gradio interface
+iface.launch()