SMD00 committed on
Commit
055dfd2
·
1 Parent(s): df0e9bd

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -0
app.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from PIL import Image
3
+ import os
4
+ import pytesseract
5
+ import torch
6
+ import numpy as np
7
+ import nltk
8
+ nltk.download('stopwords')
9
+ nltk.download('punkt')
10
+ from nltk.corpus import stopwords
11
+ from nltk.tokenize import word_tokenize, sent_tokenize
12
+ from nltk.cluster.util import cosine_distance
13
+ import networkx as nx
14
+ from transformers import pipeline
15
+
16
+
17
# Select the compute device: prefer GPU when available, fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21
+
22
+
23
# Abstractive summarization pipeline (BART fine-tuned on CNN/DailyMail),
# loaded eagerly at import time.
# NOTE(review): the `device` selected above is not passed to the pipeline, so
# it runs on the default (CPU) device even when CUDA is available — confirm intent.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
24
+
25
def read(filepath):
    """OCR the image stored at *filepath* and return the extracted text."""
    image = Image.open(filepath)
    return pytesseract.image_to_string(image)
27
+
28
def clean_text(text):
    """Split *text* into sentences on '.' and tokenize each sentence on spaces.

    Commas and apostrophes are padded with spaces so they become standalone
    tokens; empty sentence strings and empty tokens are dropped.

    Returns:
        A list of token lists, one inner list per sentence.
    """
    tokenized_sentences = []
    for raw_sentence in text.split("."):
        if raw_sentence == "":
            continue
        padded = raw_sentence.replace(",", " , ").replace("'", " ' ")
        tokens = [token for token in padded.split(" ") if token != ""]
        tokenized_sentences.append(tokens)
    return tokenized_sentences
43
+
44
def sentence_similarity(sent1, sent2, stopwords):
    """Return the cosine similarity between two tokenized sentences.

    Each sentence is mapped to a bag-of-words count vector over the union of
    both sentences' lower-cased vocabularies; words listed in *stopwords* are
    not counted.

    Args:
        sent1, sent2: lists of word tokens.
        stopwords: iterable of words to ignore, or None for no filtering.
            (NOTE: this parameter shadows the module-level nltk `stopwords`
            import; kept for interface compatibility.)

    Returns:
        float in [0, 1].  Returns 0.0 when either count vector is all zeros
        (e.g. a sentence made entirely of stop words) — the previous
        nltk `cosine_distance` version produced NaN there, which corrupted
        the downstream similarity matrix and PageRank scores.
    """
    if stopwords is None:
        stopwords = []

    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]

    all_words = list(set(sent1 + sent2))
    # Precomputed index map: O(1) lookups instead of list.index in the loops.
    word_index = {w: i for i, w in enumerate(all_words)}

    vector1 = np.zeros(len(all_words))
    vector2 = np.zeros(len(all_words))

    # Build the count vector for the first sentence.
    for w in sent1:
        if w in stopwords:
            continue
        vector1[word_index[w]] += 1

    # Build the count vector for the second sentence.
    for w in sent2:
        if w in stopwords:
            continue
        vector2[word_index[w]] += 1

    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    if norm1 == 0.0 or norm2 == 0.0:
        # Cosine similarity is undefined for a zero vector; treat as no similarity.
        return 0.0
    return float(np.dot(vector1, vector2) / (norm1 * norm2))
70
+
71
+
72
def build_similarity_matrix(sentences, stop_words):
    """Build the pairwise sentence-similarity matrix.

    Entry (i, j) holds sentence_similarity(sentences[i], sentences[j],
    stop_words); diagonal entries are left at zero (a sentence is not
    compared against itself).
    """
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row in range(count):
        for col in range(count):
            if row == col:
                continue
            matrix[row][col] = sentence_similarity(
                sentences[row], sentences[col], stop_words
            )

    return matrix
84
+
85
def sentences(text, top_n='2'):
    """Extractive summary: return the *top_n* highest-ranked sentences of *text*.

    Pipeline: tokenize -> pairwise cosine-similarity matrix -> PageRank over
    the similarity graph -> keep the top-scoring sentences.

    Args:
        text: raw input text.
        top_n: number of sentences to keep; str or int, None/"" defaults to 2.

    Returns:
        The selected sentences joined by ". " with comma/apostrophe spacing
        restored, terminated by a final ".".
    """
    if top_n is None or top_n == "":
        top_n = 2
    top_n = int(top_n)

    # Step 1 - Clean text to generate tokenized sentences.
    sentences = clean_text(text)
    stop_words = stopwords.words('english')
    stop_words.append(".")
    stop_words.append(",")
    summarize_text = []

    # Step 2 - Generate similarity matrix across sentences.
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences: PageRank on the sentence-similarity graph.
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort by score (descending) and pick the top sentences.
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Clamp so requesting more sentences than exist no longer raises IndexError.
    top_n = min(top_n, len(ranked_sentence))

    for i in range(top_n):
        # Capitalize the first word of each selected sentence in place.
        ranked_sentence[i][1][0] = ranked_sentence[i][1][0].capitalize()
        summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Re-join tokens, undoing the comma/apostrophe padding from clean_text.
    extractive_summarized = ". ".join(summarize_text).replace(" , ", ", ").replace(" ' ", "'") + "."
    return extractive_summarized
125
+
126
def important_sentences(filepath, no_of_sentences=5):
    """OCR the image at *filepath*, extract the top sentences, and return
    them as a numbered list wrapped in Gradio component updates.

    Returns a 3-tuple of updates: (output textbox text, hide the submit
    button, hide the question textbox).
    """
    ocr_text = read(filepath)
    ocr_text = ' '.join(ocr_text.split('\n'))
    extractive_summary = sentences(ocr_text, no_of_sentences)

    numbered = ""
    for position, fragment in enumerate(extractive_summary.split(".")):
        if fragment != '':
            numbered += str(position + 1) + ". " + str(fragment).strip() + ".\n\n"

    return (
        gr.Textbox.update(numbered),
        gr.Button.update(visible=False),
        gr.Textbox.update(visible=False),
    )
134
+
135
def summarize(filepath):
    """OCR the image at *filepath* and produce an abstractive (BART) summary.

    Returns a 3-tuple of Gradio updates: (summary text, hide the submit
    button, hide the question textbox).
    """
    ocr_text = read(filepath)
    ocr_text = ' '.join(ocr_text.split('\n'))
    # NOTE(review): max/min_length are token counts but are derived from the
    # character length here — presumably a rough heuristic; confirm.
    upper = int(len(ocr_text) / 6)
    lower = int(len(ocr_text) / 10)
    output = summarizer(ocr_text, max_length=upper, min_length=lower, do_sample=False)
    return (
        gr.Textbox.update(output[0]["summary_text"]),
        gr.Button.update(visible=False),
        gr.Textbox.update(visible=False),
    )
140
+
141
def Question_Answer(filepath, question):
    """Answer *question* using the text OCR'd from the image at *filepath*.

    The QA pipeline is built lazily on first use and cached on the function
    object, so repeated questions no longer reload the model on every call
    (the original constructed a fresh pipeline per invocation).

    Returns:
        The answer string extracted from the OCR'd context.
    """
    extractedInformation = read(filepath)
    extractedInformation = ' '.join(extractedInformation.split('\n'))
    if getattr(Question_Answer, "_qa_pipeline", None) is None:
        Question_Answer._qa_pipeline = pipeline(
            "question-answering", model="SMD00/QA_model-roberta"
        )
    obj = Question_Answer._qa_pipeline(question=question, context=extractedInformation)
    return obj['answer']
147
+
148
+
149
def show_fn():
    """Reveal the question textbox and submit button and clear the output box.

    Returns a 3-tuple of Gradio updates consumed by the quesAndAns click
    handler's outputs.
    """
    reveal_question = gr.Textbox.update(visible=True)
    reveal_submit = gr.Button.update(visible=True)
    cleared_output = gr.Textbox.update("")
    return (reveal_question, reveal_submit, cleared_output)
151
+
152
# --- Gradio UI wiring ---
with gr.Blocks() as demo:
    gr.Markdown("# **PicSum**")
    gr.Markdown("Gradio demo for PicSum project. You can give an image as input and select any of the three buttons. It generates summary, important sentences and answers questions related to context.")
    # Input image; type="filepath" hands the handlers an on-disk path for OCR.
    img=gr.components.Image(type="filepath", label="Input Image")

    # Three mode buttons side by side.
    with gr.Row():
        summary = gr.Button(value="Summary")
        sentence = gr.Button(value="Important Sentences")
        quesAndAns = gr.Button(value="Question and Answers")

    # Question box and submit button start hidden; they are revealed only
    # after the "Question and Answers" button is clicked (via show_fn).
    ques_box = gr.Textbox(label="Question",interactive=True,visible=False)
    submit= gr.Button(value="Submit",visible=False)
    out=gr.Textbox(label="Generated Text")
    # summarize/important_sentences return updates ordered (out, submit,
    # ques_box), re-hiding the QA widgets when another mode is selected.
    summary.click(fn=summarize,inputs=[img],outputs=[out,submit,ques_box])
    sentence.click(fn=important_sentences,inputs=[img],outputs=[out,submit,ques_box])
    # NOTE(review): show_fn returns (Textbox, Button, Textbox) updates but the
    # outputs here are [submit (Button), ques_box (Textbox), out (Textbox)] —
    # the update kwargs (visible/value) apply either way, but the pairing
    # looks swapped; confirm intended.
    quesAndAns.click(fn=show_fn,outputs=[submit,ques_box,out])
    submit.click(fn=Question_Answer,inputs=[img,ques_box],outputs=[out])

# debug=True surfaces handler tracebacks in the console/UI.
demo.launch(debug=True)