pavanhitloop committed on
Commit
c580961
·
1 Parent(s): 6c7b9e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -67
app.py CHANGED
@@ -1,10 +1,10 @@
1
  import os, sys
2
- from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, MBartForConditionalGeneration
3
- import torch
4
  import gradio as gr
5
  import requests
6
  import json
7
- from huggingface_hub import login
8
 
9
 
10
  class LTRC_Translation_API():
@@ -37,94 +37,94 @@ class LTRC_Translation_API():
37
  return ''
38
 
39
 
40
- class Headline_Generation():
41
- def __init__(self, model_name = "lokeshmadasu42/sample"):
42
- self.model_name = model_name
43
 
44
- self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
45
- self.tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False, use_fast=False, keep_accents=True)
46
- self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
47
- self.model.to(self.device)
48
- self.model.eval()
49
 
50
- self.bos_id = self.tokenizer._convert_token_to_id_with_added_voc("<s>")
51
- self.eos_id = self.tokenizer._convert_token_to_id_with_added_voc("</s>")
52
- self.pad_id = self.tokenizer._convert_token_to_id_with_added_voc("<pad>")
53
 
54
- self.lang_map = {'as': '<2as>', 'bn': '<2bn>', 'en': '<2en>', 'gu': '<2gu>', 'hi': '<2hi>', 'kn': '<2kn>', 'ml': '<2ml>', 'mr': '<2mr>', 'or': '<2or>', 'pa': '<2pa>', 'ta': '<2ta>', 'te': '<2te>'}
55
 
56
- print("Headline Generation model loaded...!")
57
 
58
 
59
- def get_headline(self, text, lang_id):
60
 
61
- inp = self.tokenizer(text, add_special_tokens=False, return_tensors="pt", padding=True).to(self.device)
62
- inp = inp['input_ids']
63
 
64
- lang_code = self.lang_map.get(lang_id, '')
65
 
66
- text = text + "</s> " + lang_code
67
- # print("Text: ", text)
68
 
69
- model_output = self.model.generate(
70
- inp,
71
- use_cache=True,
72
- num_beams=5,
73
- max_length=32,
74
- min_length=1,
75
- early_stopping=True,
76
- pad_token_id = self.pad_id,
77
- bos_token_id = self.bos_id,
78
- eos_token_id = self.eos_id,
79
- decoder_start_token_id = self.tokenizer._convert_token_to_id_with_added_voc(lang_code)
80
- )
81
 
82
- decoded_output = self.tokenizer.decode(
83
- model_output[0],
84
- skip_special_tokens=True,
85
- clean_up_tokenization_spaces=False
86
- )
87
 
88
- return decoded_output
89
 
90
 
91
- class Summarization():
92
- def __init__(self, model_name = "ashokurlana/mBART-TeSum"):
93
- self.model_name = model_name
94
 
95
- self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
96
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
97
- self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
98
- self.model.to(self.device)
99
- self.model.eval()
100
 
101
- self.lang_map = {'te': 'te_IN', 'en': 'en_XX'}
102
 
103
- print("Summarization model loaded...!")
104
 
105
 
106
- def get_summary(self, text, lang_id):
107
 
108
- inp = self.tokenizer([text], add_special_tokens=False, return_tensors="pt", max_length = 1024).to(self.device)
109
- inp = inp['input_ids']
110
 
111
- lang_code = self.lang_map.get(lang_id, '')
112
 
113
- model_output = self.model.generate(
114
- inp,
115
- use_cache=True,
116
- num_beams=5,
117
- max_length=256,
118
- early_stopping=True
119
- )
120
 
121
- decoded_output = [self.tokenizer.decode(
122
- summ_id,
123
- skip_special_tokens=True,
124
- clean_up_tokenization_spaces=False
125
- ) for summ_id in model_output]
126
 
127
- return " ".join(decoded_output)
128
 
129
 
130
  def get_prediction(text, lang_id, translate = False):
 
1
  import os, sys
2
+ # from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, MBartForConditionalGeneration
3
+ # import torch
4
  import gradio as gr
5
  import requests
6
  import json
7
+ # from huggingface_hub import login
8
 
9
 
10
  class LTRC_Translation_API():
 
37
  return ''
38
 
39
 
40
+ # class Headline_Generation():
41
+ # def __init__(self, model_name = "lokeshmadasu42/sample"):
42
+ # self.model_name = model_name
43
 
44
+ # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
45
+ # self.tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False, use_fast=False, keep_accents=True)
46
+ # self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
47
+ # self.model.to(self.device)
48
+ # self.model.eval()
49
 
50
+ # self.bos_id = self.tokenizer._convert_token_to_id_with_added_voc("<s>")
51
+ # self.eos_id = self.tokenizer._convert_token_to_id_with_added_voc("</s>")
52
+ # self.pad_id = self.tokenizer._convert_token_to_id_with_added_voc("<pad>")
53
 
54
+ # self.lang_map = {'as': '<2as>', 'bn': '<2bn>', 'en': '<2en>', 'gu': '<2gu>', 'hi': '<2hi>', 'kn': '<2kn>', 'ml': '<2ml>', 'mr': '<2mr>', 'or': '<2or>', 'pa': '<2pa>', 'ta': '<2ta>', 'te': '<2te>'}
55
 
56
+ # print("Headline Generation model loaded...!")
57
 
58
 
59
+ # def get_headline(self, text, lang_id):
60
 
61
+ # inp = self.tokenizer(text, add_special_tokens=False, return_tensors="pt", padding=True).to(self.device)
62
+ # inp = inp['input_ids']
63
 
64
+ # lang_code = self.lang_map.get(lang_id, '')
65
 
66
+ # text = text + "</s> " + lang_code
67
+ # # print("Text: ", text)
68
 
69
+ # model_output = self.model.generate(
70
+ # inp,
71
+ # use_cache=True,
72
+ # num_beams=5,
73
+ # max_length=32,
74
+ # min_length=1,
75
+ # early_stopping=True,
76
+ # pad_token_id = self.pad_id,
77
+ # bos_token_id = self.bos_id,
78
+ # eos_token_id = self.eos_id,
79
+ # decoder_start_token_id = self.tokenizer._convert_token_to_id_with_added_voc(lang_code)
80
+ # )
81
 
82
+ # decoded_output = self.tokenizer.decode(
83
+ # model_output[0],
84
+ # skip_special_tokens=True,
85
+ # clean_up_tokenization_spaces=False
86
+ # )
87
 
88
+ # return decoded_output
89
 
90
 
91
+ # class Summarization():
92
+ # def __init__(self, model_name = "ashokurlana/mBART-TeSum"):
93
+ # self.model_name = model_name
94
 
95
+ # self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
96
+ # self.tokenizer = AutoTokenizer.from_pretrained(model_name)
97
+ # self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
98
+ # self.model.to(self.device)
99
+ # self.model.eval()
100
 
101
+ # self.lang_map = {'te': 'te_IN', 'en': 'en_XX'}
102
 
103
+ # print("Summarization model loaded...!")
104
 
105
 
106
+ # def get_summary(self, text, lang_id):
107
 
108
+ # inp = self.tokenizer([text], add_special_tokens=False, return_tensors="pt", max_length = 1024).to(self.device)
109
+ # inp = inp['input_ids']
110
 
111
+ # lang_code = self.lang_map.get(lang_id, '')
112
 
113
+ # model_output = self.model.generate(
114
+ # inp,
115
+ # use_cache=True,
116
+ # num_beams=5,
117
+ # max_length=256,
118
+ # early_stopping=True
119
+ # )
120
 
121
+ # decoded_output = [self.tokenizer.decode(
122
+ # summ_id,
123
+ # skip_special_tokens=True,
124
+ # clean_up_tokenization_spaces=False
125
+ # ) for summ_id in model_output]
126
 
127
+ # return " ".join(decoded_output)
128
 
129
 
130
  def get_prediction(text, lang_id, translate = False):