Spaces:

hitloop
/

News_Te_Summ_Head

Build error

App Files Files Community

pavanhitloop committed on Nov 10, 2023

Commit

c491574

1 Parent(s): f77573d

Create app.py

Browse files

Files changed (1) hide show

app.py +190 -0

app.py ADDED Viewed

	@@ -0,0 +1,190 @@

+import os, sys
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, MBartForConditionalGeneration
+import torch
+import gradio as gr
+import requests
+import json
+class LTRC_Translation_API():
+    def __init__(self, url = 'https://ssmt.iiit.ac.in/onemt', src_lang = 'eng', tgt_lang = 'te'):
+        self.lang_map = {'te': 'tel', 'en': 'eng', 'ta': 'tam', 'ml': 'mal', 'mr': 'mar', 'kn': 'kan', 'hi': 'hin'}
+        self.url = url
+        self.headers = {
+            'Content-Type': 'application/json',
+            'Accept': 'application/json'
+        }
+        lang = self.lang_map.get(tgt_lang, 'te')
+        self.src_lang = src_lang
+        self.tgt_lang = lang
+    def translate(self, text):
+        try:
+            data = {'text': text, 'source_language': self.src_lang, 'target_language': self.tgt_lang}
+            response = requests.post(self.url, headers = self.headers, json = data)
+            translated_text = json.loads(response.text).get('data', '')
+            return translated_text
+        except Exception as e:
+            print("Exception: ", e)
+            return ''
+class Headline_Generation():
+	def __init__(self, model_name = "ai4bharat/MultiIndicHeadlineGenerationSS"):
+		self.model_name = model_name
+		self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+		self.tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False, use_fast=False, keep_accents=True)
+		self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+		self.model.to(self.device)
+		self.model.eval()
+		self.bos_id = self.tokenizer._convert_token_to_id_with_added_voc("<s>")
+		self.eos_id = self.tokenizer._convert_token_to_id_with_added_voc("</s>")
+		self.pad_id = self.tokenizer._convert_token_to_id_with_added_voc("<pad>")
+		self.lang_map = {'as': '<2as>', 'bn': '<2bn>', 'en': '<2en>', 'gu': '<2gu>', 'hi': '<2hi>', 'kn': '<2kn>', 'ml': '<2ml>', 'mr': '<2mr>', 'or': '<2or>', 'pa': '<2pa>', 'ta': '<2ta>', 'te': '<2te>'}
+		print("Headline Generation model loaded...!")
+	def get_headline(self, text, lang_id):
+		inp = self.tokenizer(text, add_special_tokens=False, return_tensors="pt", padding=True).to(self.device)
+		inp = inp['input_ids']
+		lang_code = self.lang_map.get(lang_id, '')
+		text = text + "</s> " + lang_code
+		# print("Text: ", text)
+		model_output = self.model.generate(
+			inp,
+			use_cache=True,
+			num_beams=5,
+			max_length=32,
+			min_length=1,
+			early_stopping=True,
+			pad_token_id = self.pad_id,
+			bos_token_id = self.bos_id,
+			eos_token_id = self.eos_id,
+			decoder_start_token_id = self.tokenizer._convert_token_to_id_with_added_voc(lang_code)
+		)
+		decoded_output = self.tokenizer.decode(
+			model_output[0],
+			skip_special_tokens=True,
+			clean_up_tokenization_spaces=False
+		)
+		return decoded_output
+class Summarization():
+	def __init__(self, model_name = "ai4bharat/MultiIndicSentenceSummarizationSS"):
+		self.model_name = model_name
+		self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
+		self.tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False, use_fast=False, keep_accents=True)
+		self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+		self.model.to(self.device)
+		self.model.eval()
+		self.bos_id = self.tokenizer._convert_token_to_id_with_added_voc("<s>")
+		self.eos_id = self.tokenizer._convert_token_to_id_with_added_voc("</s>")
+		self.pad_id = self.tokenizer._convert_token_to_id_with_added_voc("<pad>")
+		self.lang_map = {'as': '<2as>', 'bn': '<2bn>', 'en': '<2en>', 'gu': '<2gu>', 'hi': '<2hi>', 'kn': '<2kn>', 'ml': '<2ml>', 'mr': '<2mr>', 'or': '<2or>', 'pa': '<2pa>', 'ta': '<2ta>', 'te': '<2te>'}
+		print("Summarization model loaded...!")
+	def get_summary(self, text, lang_id):
+		inp = self.tokenizer(text, add_special_tokens=False, return_tensors="pt", padding=True).to(self.device)
+		inp = inp['input_ids']
+		lang_code = self.lang_map.get(lang_id, '')
+		text = text + "</s> " + lang_code
+		# print("Text: ", text)
+		model_output = self.model.generate(
+			inp,
+			use_cache=True,
+			num_beams=5,
+			max_length=32,
+			min_length=1,
+			early_stopping=True,
+			pad_token_id = self.pad_id,
+			bos_token_id = self.bos_id,
+			eos_token_id = self.eos_id,
+			decoder_start_token_id = self.tokenizer._convert_token_to_id_with_added_voc(lang_code)
+		)
+		decoded_output = self.tokenizer.decode(
+			model_output[0],
+			skip_special_tokens=True,
+			clean_up_tokenization_spaces=False
+		)
+		return decoded_output
+def get_prediction(text, lang_id, translate = False):
+    # if len(sys.argv)<3:
+    # 	print("Usage: python app.py <text_file_path> <lang_id>")
+    # 	print("Text file should contain the article news")
+    # 	exit()
+    # txt_path = sys.argv[1]
+    # lang_id = sys.argv[2]
+    # if not os.path.exists(txt_path):
+    # 	print("Path: {} do not exists".format(txt_path))
+    # 	exit()
+    # text = ''
+    # with open(txt_path, 'r', encoding='utf-8') as fp:
+    # 	text = fp.read().strip()
+    headline_generator = Headline_Generation()
+    summarizer = Summarization()
+    if translate == True:
+        translator = LTRC_Translation_API(tgt_lang = lang_id)
+        text = translator.translate(text)
+    headline = headline_generator.get_headline(text, lang_id)
+    summary = summarizer.get_summary(text, lang_id)
+    # print("Article: ", text)
+    # print("Summary: ", summary)
+    # print("Headline: ", headline)
+    # return "Headline: " + headline + "\nSummary: " + summary
+    return [text, summary, headline]
+interface = gr.Interface(
+    get_prediction,
+    inputs=[
+        gr.Textbox(lines = 8, label = "News Article Text", info = "Provide the news article text here. Check the `Translate` if the source language is english."),
+        gr.Dropdown(
+            ['as', 'bn', 'en', 'gu', 'hi', 'kn', 'ml', 'mr', 'or', 'pa', 'ta', 'te'], label="Language code", info="select the target language code"
+        ),
+        gr.Checkbox(label="Translate", info="Is translation required?")
+    ],
+    outputs=[
+        gr.Textbox(lines = 8, label = "Source Article Text", info = "Source article text (if `Translate` is enabled then the source will be translated to target language)"),
+        gr.Textbox(lines = 4, label = "Summary", info = "Summary of the given article (translated if `Translate` is enabled)"),
+        gr.Textbox(lines = 2, label = "Headline", info = "Generated headline of the given article (translated if `Translate` is enabled)")
+    ]
+)
+interface.launch(share=True)