yfyangd committed on
Commit
92cb60e
·
1 Parent(s): ff2a22c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -0
app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ # In[ ]:
5
+
6
+
7
+ import PyPDF2
8
+ import jieba
9
+ import jieba.posseg as pseg
10
+ from jieba import analyse
11
+ import gradio as gr
12
+ import numpy as np
13
+ import os
14
+
15
def countIDF(text, topK):
    """Return occurrence counts for the top keywords of *text*.

    The text is segmented with ``jieba.cut`` and every token is counted.
    ``jieba.analyse.extract_tags`` then selects up to *topK* keywords, and
    the raw count of each selected keyword is returned in the order the
    extractor produced them.

    Args:
        text: The document text to analyse.
        topK: Maximum number of keywords to extract.

    Returns:
        A list of integer counts, one per extracted keyword. The list can
        be shorter than *topK* when the text yields fewer keywords.
    """
    from collections import Counter

    token_counts = Counter(jieba.cut(text))
    keywords = analyse.extract_tags(text, topK=topK, withWeight=True)
    # With withWeight=True each entry is a (word, weight) pair; only the
    # word is needed here. Counter yields 0 for a keyword the segmenter did
    # not emit, instead of raising KeyError.
    return [token_counts[word] for word, _weight in keywords]
28
+
29
def pers_sim(a, b):
    """Return the Pearson correlation coefficient of two numeric sequences.

    Both inputs are centred on their mean and the normalised dot product of
    the centred vectors is returned.

    Args:
        a: First sequence of numbers.
        b: Second sequence of numbers.

    Returns:
        The correlation as a float in ``[-1, 1]``. ``0.0`` is returned when
        the correlation is undefined, i.e. both inputs are empty or either
        input has zero variance (the original expression evaluated 0/0 and
        produced ``nan`` in that case).
    """
    a = np.asarray(a, dtype=float).ravel()
    b = np.asarray(b, dtype=float).ravel()

    size = max(a.size, b.size)
    if size == 0:
        return 0.0
    if a.size != b.size:
        # Zero-pad the shorter vector so the element-wise product is defined
        # instead of raising a broadcasting error.
        a = np.pad(a, (0, size - a.size))
        b = np.pad(b, (0, size - b.size))

    a = a - a.mean()
    b = b - b.mean()
    denom = np.sqrt(np.sum(a ** 2)) * np.sqrt(np.sum(b ** 2))
    if denom == 0:
        # A constant vector has no variance, so no correlation is defined.
        return 0.0
    return float(np.sum(a * b) / denom)
35
+
36
def splitWord_PersionSimlaryty(str_a,str_b,topK=20,sim=pers_sim):
    """Score how similar two texts are via their keyword-count vectors.

    Each text is turned into a count vector with ``countIDF`` (first
    *str_a*, then *str_b*) and the two vectors are handed to *sim*.

    Args:
        str_a: First document text.
        str_b: Second document text.
        topK: Number of keywords requested from ``countIDF`` for each text.
        sim: Callable taking the two count vectors and returning a score.

    Returns:
        Whatever *sim* returns for the two vectors.
    """
    counts_a, counts_b = (countIDF(doc, topK) for doc in (str_a, str_b))
    return sim(counts_a, counts_b)
40
+
41
def _read_pdf_text(pdf):
    """Return the text of every page of *pdf*, each page followed by a space."""
    # Prefer the current PyPDF2 names (PdfReader / extract_text) when the
    # installed version provides them, and fall back to the legacy
    # spellings otherwise.
    reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
    reader = reader_cls(pdf)

    chunks = []
    # Iterate over all pages; the previous range(number_of_pages - 1) loop
    # silently skipped the last page of every document.
    for page in reader.pages:
        extract = getattr(page, "extract_text", None) or page.extractText
        chunks.append(extract() + " ")
    return " " + "".join(chunks)


def similarity(A, B):
    """Compare two PDF documents and report their similarity as a percentage.

    The text of every page of each PDF is extracted and concatenated, and
    the two resulting strings are scored with
    ``splitWord_PersionSimlaryty``.

    Args:
        A: First PDF, in whatever form the PyPDF2 reader accepts.
        B: Second PDF, same form as *A*.

    Returns:
        A string made of the fixed label, the score multiplied by 100 and
        rounded to two decimals, and a trailing percent sign.
    """
    score = splitWord_PersionSimlaryty(_read_pdf_text(A), _read_pdf_text(B))
    return "論文相似度: " + str(round(score * 100, 2)) + "%"
65
+
66
# Build the web UI (two uploaded files in, one text result out) and serve it.
demo = gr.Interface(fn=similarity, inputs=["file", "file"], outputs="text")
demo.launch()
67
+