mrfirdauss committed on
Commit
3518d42
·
verified ·
1 Parent(s): 997fd3d
Files changed (1) hide show
  1. app.py +163 -0
app.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gradio as gr
3
+ import re
4
+ import numpy as np
5
+ from typing import List, Optional, Any
6
+ from transformers import RobertaTokenizerFast, AutoModelForTokenClassification
7
+
8
+ tokenizer = RobertaTokenizerFast.from_pretrained("mrfirdauss/robert-base-finetuned-cv")
9
+ model = AutoModelForTokenClassification.from_pretrained("mrfirdauss/robert-base-finetuned-cv")
10
# Index -> BIO tag mapping for the token-classification head.
# Label ids come in pairs: for the k-th entity type (0-based),
# B-tag = 2k + 1 and the matching I-tag = 2k + 2; id 0 is the
# "outside" tag 'O'.
_ENTITY_TYPES = [
    'NAME', 'NATION', 'EMAIL', 'URL', 'CAMPUS', 'MAJOR', 'COMPANY',
    'DESIGNATION', 'GPA', 'PHONE NUMBER', 'ACHIEVEMENT',
    'EXPERIENCES DESC', 'SKILLS', 'PROJECTS',
]

id2label = {0: 'O'}
for _idx, _entity in enumerate(_ENTITY_TYPES):
    id2label[2 * _idx + 1] = f'B-{_entity}'
    id2label[2 * _idx + 2] = f'I-{_entity}'
39
+
40
def merge_subwords(tokens, labels):
    """Merge RoBERTa BPE sub-word tokens back into whole words.

    RoBERTa marks the start of a new word with a leading 'Ġ'; continuation
    pieces have no marker. Each merged word keeps the label of its FIRST
    sub-word piece.

    Args:
        tokens: list of BPE token strings, as produced by the tokenizer.
        labels: list of tag strings, one per token (same length as tokens).

    Returns:
        (merged_tokens, merged_labels): two parallel lists of words and tags.
    """
    merged_tokens = []
    merged_labels = []

    current_token = ""
    current_label = ""

    for token, label in zip(tokens, labels):
        if token.startswith("Ġ"):
            # A 'Ġ' piece starts a new word: flush the accumulated one first.
            if current_token:
                merged_tokens.append(current_token)
                merged_labels.append(current_label)
            current_token = token[1:]  # drop the 'Ġ' word-boundary marker
            current_label = label
        else:
            if not current_token:
                # Bug fix: the very first token of a sequence has no leading
                # 'Ġ'; without capturing its label here, the first merged
                # word was emitted with an empty label.
                current_label = label
            current_token += token

    # Flush the final accumulated word, if any.
    if current_token:
        merged_tokens.append(current_token)
        merged_labels.append(current_label)

    return merged_tokens, merged_labels
66
+
67
def chunked_inference(text, tokenizer, model, max_length=512):
    """Run token classification over arbitrarily long text by chunking.

    The text is split into word-level pieces, tokenized once, then fed to the
    model in windows of at most ``max_length`` tokens (2 positions reserved
    for the <s>/</s> special tokens). Per-token argmax predictions are mapped
    through the module-level ``id2label`` table and merged back to words.

    Args:
        text: raw input string.
        tokenizer: a RoBERTa-style fast tokenizer.
        model: a token-classification model returning logits.
        max_length: maximum model input length, including special tokens.

    Returns:
        (merged_tokens, merged_labels) from ``merge_subwords``.
    """
    # Pre-split into words/punctuation so tokenization aligns with words.
    words = re.findall(r'\w+|[^\w\s]', text, re.UNICODE)
    tokens = tokenizer.tokenize(words, is_split_into_words=True)

    # Build fixed-size chunks; -2 reserves room for the special tokens.
    stride = max_length - 2
    input_ids_chunks = []
    for start in range(0, len(tokens), stride):
        chunk = tokens[start:start + stride]
        chunk_ids = tokenizer.convert_tokens_to_ids(chunk)
        chunk_ids = tokenizer.build_inputs_with_special_tokens(chunk_ids)
        input_ids_chunks.append(torch.tensor(chunk_ids).unsqueeze(0))

    predictions = []
    # Inference only: disable autograd bookkeeping (was missing; also removed
    # a leftover debug print of the full token list).
    with torch.no_grad():
        for input_ids in input_ids_chunks:
            attention_mask = torch.ones_like(input_ids)
            output = model(input_ids, attention_mask=attention_mask)
            logits = output[0] if isinstance(output, tuple) else output.logits
            chunk_preds = torch.argmax(logits, dim=-1).squeeze(0)
            # Drop the predictions for the <s> and </s> positions.
            predictions.append(chunk_preds[1:-1])

    # Robustness: empty input text previously crashed in torch.cat.
    if not predictions:
        return [], []

    predictions = torch.cat(predictions, dim=0)
    predicted_labels = [id2label[pred.item()] for pred in predictions]
    return merge_subwords(tokens, predicted_labels)
102
+
103
def process_tokens(tokens, tag_prefix):
    """Extract entities of one type from BIO-tagged (token, tag) pairs.

    Args:
        tokens: iterable of (token, tag) pairs, tags like 'B-SKILLS'.
        tag_prefix: entity type to collect, e.g. 'SKILLS' or 'COMPANY'.

    Returns:
        List of dicts with 'text' (space-joined tokens) and 'type' (the
        opening B- tag) for each contiguous entity span.
    """
    entities = []
    current_entity = {}
    for token, tag in tokens:
        if tag.startswith('B-') and tag.endswith(tag_prefix):
            # A new entity begins: flush any entity still open.
            if current_entity:
                entities.append(current_entity)
            current_entity = {'text': token, 'type': tag}
        elif tag.startswith('I-') and tag.endswith(tag_prefix) and current_entity:
            # Continuation of the currently open entity.
            current_entity['text'] += ' ' + token
        elif current_entity:
            # Bug fix: close the open entity on any non-matching tag.
            # Previously it stayed open, so a stray later I- tag of the same
            # type was glued onto it across the gap.
            entities.append(current_entity)
            current_entity = {}
    # Flush a trailing open entity.
    if current_entity:
        entities.append(current_entity)
    return entities
123
+
124
def predict(text):
    """Run NER over a CV text and assemble a structured profile dict.

    Args:
        text: raw resume/CV text.

    Returns:
        dict with keys 'name' (str), 'skills' (list[str]),
        'experiences' (list[dict]) and 'educations' (list[dict]).
        'start'/'end' dates are always None — the model does not tag them.
    """
    tokens, predictions = chunked_inference(text, tokenizer, model)
    data = list(zip(tokens, predictions))
    profile = {
        "name": "",
        "skills": [],
        "experiences": [],
        "educations": []
    }
    # Any token tagged *-NAME contributes to the candidate's name.
    profile['name'] = ' '.join([t for t, p in data if p.endswith('NAME')])
    for skill in process_tokens(data, 'SKILLS'):
        profile['skills'].append(skill['text'])
    # Pair up designation / company / description spans positionally.
    # Bug fix: 'company' was previously filled from the CAMPUS tag instead
    # of COMPANY, mixing education campuses into work experiences.
    for designation, company, experience_desc in zip(
            process_tokens(data, 'DESIGNATION'),
            process_tokens(data, 'COMPANY'),
            process_tokens(data, 'EXPERIENCES DESC')):
        profile['experiences'].append({
            "start": None,
            "end": None,
            "designation": designation['text'],
            "company": company['text'],
            "experience_description": experience_desc['text']
        })
    # Pair up major / GPA / campus spans positionally for education entries.
    for major, gpa, campus in zip(
            process_tokens(data, 'MAJOR'),
            process_tokens(data, 'GPA'),
            process_tokens(data, 'CAMPUS')):
        profile['educations'].append({
            "start": None,
            "end": None,
            "major": major['text'],
            "campus": campus['text'],
            "GPA": gpa['text']
        })

    return profile
155
# Gradio UI: single textbox in, structured JSON profile out.
# Bug fix: the title was "Hot Dog? Or Not?" — a leftover from the gradio
# demo template — which misdescribed this CV/resume NER app.
gradio_app = gr.Interface(
    predict,
    inputs=gr.Textbox(label="Enter Text"),
    outputs=gr.JSON(label="Extracted CV Profile"),
    title="CV Named-Entity Extraction",
)

if __name__ == "__main__":
    gradio_app.launch()