File size: 7,271 Bytes
055dfd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195d5cb
 
055dfd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0650090
055dfd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0650090
 
 
055dfd2
 
 
 
 
 
 
 
 
 
 
 
 
0650090
 
 
 
055dfd2
 
 
9774ecb
055dfd2
 
 
 
1f0fa3b
 
055dfd2
9774ecb
055dfd2
 
9774ecb
 
 
 
055dfd2
 
 
9774ecb
 
 
 
 
055dfd2
 
 
 
9774ecb
055dfd2
1f0fa3b
 
 
055dfd2
9774ecb
 
1f0fa3b
 
 
 
 
 
9774ecb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12b481f
9774ecb
 
 
 
 
 
055dfd2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import gradio as gr
from PIL import Image
import pytesseract
import torch
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import networkx as nx
from transformers import pipeline


# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Abstractive summarization pipeline, created once at import time.
# `device` is passed explicitly so the model is placed on the device selected
# above instead of the pipeline's default.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

def read(filepath):
    """Return the text that Tesseract OCR recognises in the image at *filepath*.

    Args:
        filepath: Path of the image file to open with PIL.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager closes the underlying file handle once OCR is done;
    # a bare Image.open() would leave it open until garbage collection.
    with Image.open(filepath) as image:
        return pytesseract.image_to_string(image)

def clean_text(text):
    """Split *text* into sentences and each sentence into word tokens.

    Sentences are delimited by ".". Commas and apostrophes are padded with
    spaces so that they end up as tokens of their own. Fragments that contain
    no tokens at all (for example the whitespace after a trailing ". ") are
    dropped, so every returned sentence has at least one word.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of sentences, each being a list of string tokens.
    """
    tokenized = []
    for fragment in text.split("."):
        padded = fragment.replace(",", " , ").replace("'", " ' ")
        # split() without an argument splits on any run of whitespace and
        # never yields empty strings, so no separate filtering is needed.
        words = padded.split()
        if words:
            tokenized.append(words)
    return tokenized

def sentence_similarity(sent1, sent2, stopwords):
    """Return the cosine similarity of two tokenized sentences.

    Both sentences are lower-cased and turned into word-count vectors over
    their combined vocabulary; words listed in *stopwords* are not counted.
    The similarity is the dot product of the two vectors divided by the
    product of their Euclidean norms.

    Args:
        sent1: First sentence as a list of word tokens.
        sent2: Second sentence as a list of word tokens.
        stopwords: Iterable of lower-case words to ignore, or None for none.

    Returns:
        The cosine similarity as a float, or 0 when either sentence has no
        countable words (its vector is all zeros).
    """
    ignored = set(stopwords) if stopwords is not None else set()

    words1 = [w.lower() for w in sent1]
    words2 = [w.lower() for w in sent2]

    # Map each distinct word to a vector position; a dict lookup replaces the
    # linear list.index() scan per word.
    position = {word: i for i, word in enumerate(dict.fromkeys(words1 + words2))}

    vector1 = np.zeros(len(position))
    vector2 = np.zeros(len(position))

    # build the vector for the first sentence
    for w in words1:
        if w not in ignored:
            vector1[position[w]] += 1

    # build the vector for the second sentence
    for w in words2:
        if w not in ignored:
            vector2[position[w]] += 1

    norm_product = np.sqrt(np.dot(vector1, vector1)) * np.sqrt(np.dot(vector2, vector2))
    # An all-zero vector would make the ratio 0/0 (NaN); report "no similarity"
    # explicitly instead of dividing and testing for NaN afterwards.
    if norm_product == 0:
        return 0
    return float(np.dot(vector1, vector2) / norm_product)


def build_similarity_matrix(sentences, stop_words):
    """Compute the pairwise similarity of every sentence with every other one.

    Args:
        sentences: List of tokenized sentences.
        stop_words: Words forwarded to ``sentence_similarity`` to be ignored.

    Returns:
        A square numpy array where entry [i][j] holds the similarity of
        sentence i and sentence j; the diagonal is left at zero.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            # a sentence is not compared with itself
            if row != col:
                matrix[row][col] = sentence_similarity(first, second, stop_words)

    return matrix

def sentences(text, top_n="auto"):
    """Return an extractive summary made of the highest-ranked sentences of *text*.

    The text is tokenized with ``clean_text``, a sentence-similarity matrix is
    built, and the sentences are scored by running PageRank on the graph
    derived from that matrix. The best-scoring sentences are joined, in
    ranking order, into one string.

    Args:
        text: The text to summarize.
        top_n: Number of sentences to keep, or "auto" to keep all of them.
            Values larger than the number of sentences are clamped to it;
            negative values select nothing.

    Returns:
        The selected sentences joined with ". " and terminated by ".".
        When no sentence is found the result is just ".".

    Raises:
        ValueError: If *top_n* is neither "auto" nor convertible to int.
    """
    # Step 1 - Clean text to generate sentences (skip any without tokens, so
    # indexing the first word below is always safe)
    tokenized = [words for words in clean_text(text) if words]
    stop_words = stopwords.words('english')
    stop_words.extend([".", ","])

    if top_n == "auto":
        top_n = len(tokenized)
    else:
        # Clamp so asking for more sentences than exist cannot run past the end.
        top_n = max(0, min(int(top_n), len(tokenized)))

    if not tokenized:
        return "."

    # Step 2 - Generate similarity matrix across sentences
    similarity_matrix = build_similarity_matrix(tokenized, stop_words)

    # Step 3 - Rank sentences in the similarity matrix
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)

    # Step 4 - Sort by score in descending order and pick the top sentences
    ranked = sorted(((scores[i], words) for i, words in enumerate(tokenized)), reverse=True)

    summarize_text = []
    for _, words in ranked[:top_n]:
        first_word = words[0]
        # Upper-case only the first letter; str.capitalize() would also
        # lower-case the rest of the word (e.g. "NASA" -> "Nasa").
        capitalized = first_word[:1].upper() + first_word[1:]
        summarize_text.append(" ".join([capitalized] + words[1:]))

    # Step 5 - Output the summarized text, undoing the token padding
    extractive_summarized = ". ".join(summarize_text).replace(" , ", ", ").replace(" ' ", "'") + "."
    return extractive_summarized

def important_sentences(filepath, no_of_sentences=5):
    """OCR the image at *filepath* and list its most important sentences.

    Args:
        filepath: Path of the image to read.
        no_of_sentences: How many sentences to request from ``sentences``.

    Returns:
        A 4-tuple of Gradio updates: the numbered sentences for the output
        textbox, followed by three updates with ``visible=False`` for a
        button, a textbox and a dropdown.
    """
    extracted_text = ' '.join(read(filepath).split('\n'))
    try:
        extractive_summary = sentences(extracted_text, no_of_sentences)
    except Exception:
        # Best-effort fallback: if the requested count cannot be served, rank
        # and return every sentence instead. `Exception` (not a bare except)
        # keeps KeyboardInterrupt/SystemExit propagating.
        extractive_summary = sentences(extracted_text, "auto")

    # Drop blank fragments first so the numbering stays consecutive.
    stripped = (part.strip() for part in extractive_summary.split("."))
    kept = [part for part in stripped if part]
    text = "".join(f"{number}. {part}.\n\n" for number, part in enumerate(kept, start=1))
    return (gr.Textbox.update(text),gr.Button.update(visible=False),gr.Textbox.update(visible=False),gr.Dropdown.update(visible=False))

def summarize(filepath):
    """OCR the image at *filepath* and return an abstractive summary of its text.

    Args:
        filepath: Path of the image to read.

    Returns:
        A 4-tuple of Gradio updates: the summary for the output textbox,
        followed by three updates with ``visible=False`` for a button, a
        textbox and a dropdown. The summary is empty when OCR found no text.
    """
    extracted_text = ' '.join(read(filepath).split('\n'))
    hide_controls = (gr.Button.update(visible=False),gr.Textbox.update(visible=False),gr.Dropdown.update(visible=False))

    # Nothing to summarize: skip the model instead of calling it with
    # max_length=0.
    if not extracted_text.strip():
        return (gr.Textbox.update(""),) + hide_controls

    # Length bounds are a heuristic derived from the character count
    # (1/10 .. 1/6 of it); max_length is floored at 1 so very short texts
    # still yield a valid generation length.
    max_length = max(len(extracted_text) // 6, 1)
    min_length = len(extracted_text) // 10

    # truncation=True cuts input that is longer than the model accepts rather
    # than failing on it.
    abstractive_summary = summarizer(extracted_text, max_length=max_length, min_length=min_length, do_sample=False, truncation=True)
    return (gr.Textbox.update(abstractive_summary[0]["summary_text"]),) + hide_controls

# Question-answering pipelines that have already been loaded, keyed by model id.
_QA_PIPELINES = {}


def _get_question_answerer(model_id):
    """Return the question-answering pipeline for *model_id*, loading it on first use only."""
    if model_id not in _QA_PIPELINES:
        _QA_PIPELINES[model_id] = pipeline("question-answering", model=model_id)
    return _QA_PIPELINES[model_id]


def Question_Answer(filepath,question,mod):
    """Answer *question* using the text OCR'd from the image at *filepath*.

    Args:
        filepath: Path of the image whose text serves as the context.
        question: The question to answer.
        mod: Model choice; "Roberta" selects the RoBERTa model, any other
            value selects the DistilBERT model.

    Returns:
        The ``answer`` field of the pipeline result.
    """
    context = ' '.join(read(filepath).split('\n'))
    if mod=="Roberta":
        model_id = "SMD00/QA_model-roberta"
    else:
        model_id = "SMD00/QA_model-distilbert"
    # Reuse the cached pipeline so the model is not reloaded on every request.
    question_answerer = _get_question_answerer(model_id)
    result = question_answerer(question=question, context=context)
    return result['answer']

def show_fn():
    """Return the updates that reveal the question-answering controls.

    The tuple is ordered to match the components it is wired to in the UI
    (``outputs=[submit_btn, ques_box, mode, out_box]``): a button update, a
    textbox update and a dropdown update that set ``visible=True``, followed
    by a textbox update that sets the value to "".
    """
    return (gr.Button.update(visible=True),gr.Textbox.update(visible=True),gr.Dropdown.update(visible=True),gr.Textbox.update(""))
def dummy_fn(x):
    """Return *x* unchanged (identity function passed as ``fn`` to ``gr.Examples``)."""
    return x

# Build the Gradio UI: an image input, three action buttons, the (initially
# hidden) question-answering controls, one output box and example images.
with gr.Blocks() as demo:
    gr.Markdown("# **PicSum**")
    gr.Markdown("Gradio demo for PicSum project. You can give an image as input and select any of the three buttons. It generates summary, important sentences and answers questions related to context.")
    img = gr.components.Image(type="filepath", label="Input Image")

    with gr.Row():
        summary_btn = gr.Button(value="Summary")
        sentence_btn = gr.Button(value="Important Sentences")
        quesAndAns_btn = gr.Button(value="Question and Answers")

    # Question-answering controls start hidden.
    mode = gr.Dropdown(
        ["Roberta", "DistilBert"],
        label="Model",
        info="Choose a model",
        visible=False,
    )
    ques_box = gr.Textbox(
        label="Question",
        info="Enter a Question",
        interactive=True,
        visible=False,
    )
    submit_btn = gr.Button(value="Submit", visible=False)
    out_box = gr.Textbox(label="Generated Text")

    # Wire each button to its handler.
    summary_btn.click(
        fn=summarize,
        inputs=[img],
        outputs=[out_box, submit_btn, ques_box, mode],
    )
    sentence_btn.click(
        fn=important_sentences,
        inputs=[img],
        outputs=[out_box, submit_btn, ques_box, mode],
    )
    quesAndAns_btn.click(
        fn=show_fn,
        outputs=[submit_btn, ques_box, mode, out_box],
    )
    submit_btn.click(
        fn=Question_Answer,
        inputs=[img, ques_box, mode],
        outputs=[out_box],
    )

    gr.Markdown("## Image Examples")
    with gr.Row():
        # One single-image Examples widget per sample file, in this order.
        for example_image in ("a.png", "b.png", "c.png", "d.png"):
            gr.Examples(
                examples=[example_image],
                inputs=img,
                outputs=img,
                fn=dummy_fn,
                cache_examples=True,
            )
demo.launch(debug=True)