Spaces:

yfyangd
/

PaperSimilarity

Sleeping

App Files Files Community

yfyangd committed on Aug 13, 2022

Commit

92cb60e

1 Parent(s): ff2a22c

Upload app.py

Browse files

Files changed (1) hide show

app.py +67 -0

app.py ADDED Viewed

	@@ -0,0 +1,67 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[ ]:
+import PyPDF2
+import jieba
+import jieba.posseg as pseg
+from jieba import analyse
+import gradio as gr
+import numpy as np
+import os
+def countIDF(text,topK):
+    """Return occurrence counts for the top ``topK`` TF-IDF keywords of ``text``.
+
+    Despite the name, the values returned are raw term frequencies, not
+    IDF weights: TF-IDF is only used to choose which keywords to report.
+
+    Args:
+        text: The string to analyse.
+        topK: Passed to ``jieba.analyse.extract_tags`` as the number of
+            keywords to extract.
+
+    Returns:
+        A list of ints, one per extracted keyword, in the order
+        ``extract_tags`` returned them. It is shorter than ``topK`` when
+        fewer keywords are extracted.
+    """
+    # Tally how often each jieba token occurs in the text.
+    frequency = {}
+    for token in jieba.cut(text):
+        frequency[token] = frequency.get(token, 0) + 1
+    # With withWeight=True each entry is indexable and entry[0] is the keyword;
+    # the weight itself is not used here.
+    ranked = analyse.extract_tags(text,topK,withWeight=True)
+    return [frequency[entry[0]] for entry in ranked]
+def pers_sim(a,b):
+    """Return the Pearson correlation coefficient of sequences ``a`` and ``b``.
+
+    Both inputs are converted to NumPy arrays, mean-centred, and the
+    centred dot product is divided by the product of the centred norms.
+    No guard is applied for zero variance: a constant (or empty) input
+    makes the denominator zero and the result NaN.
+    """
+    centered_a = np.array(a)
+    centered_b = np.array(b)
+    centered_a = centered_a - np.average(centered_a)
+    centered_b = centered_b - np.average(centered_b)
+    covariance = np.sum(centered_a * centered_b)
+    norm_a = np.sqrt(np.sum(centered_a ** 2))
+    norm_b = np.sqrt(np.sum(centered_b ** 2))
+    return covariance / (norm_a * norm_b)
+def splitWord_PersionSimlaryty(str_a,str_b,topK=20,sim=pers_sim):
+    """Score the similarity of two texts from their keyword-frequency profiles.
+
+    Each text is reduced by ``countIDF`` to a list of occurrence counts for
+    its top ``topK`` keywords, and the two lists are compared with ``sim``.
+
+    Args:
+        str_a: First text.
+        str_b: Second text.
+        topK: Number of keywords requested from each text.
+        sim: Callable taking the two count vectors and returning a score;
+            defaults to ``pers_sim``.
+
+    Returns:
+        Whatever ``sim`` returns for the two count vectors.
+    """
+    vec_a = countIDF(str_a,topK)
+    vec_b = countIDF(str_b,topK)
+    # A short text can yield fewer than topK keywords, leaving the vectors
+    # with different lengths; element-wise metrics then fail (or silently
+    # broadcast a length-1 vector). Compare only the positions both have.
+    common = min(len(vec_a), len(vec_b))
+    return sim(vec_a[:common],vec_b[:common])
+def _read_pdf_text(pdf):
+    """Return the extracted text of every page of ``pdf`` as one string."""
+    reader = PyPDF2.PdfFileReader(pdf)
+    # range(n) already ends at index n-1, i.e. the last page; subtracting one
+    # more would drop the final page (and all text of a one-page PDF).
+    pages = [reader.pages[i].extractText() for i in range(reader.getNumPages())]
+    # One leading space, then each page followed by a single space.
+    return ' ' + ''.join(page + ' ' for page in pages)
+def similarity(A,B):
+    """Compare two PDF documents and report their similarity as a percentage.
+
+    Args:
+        A: First PDF, in any form ``PyPDF2.PdfFileReader`` accepts.
+        B: Second PDF, same form as ``A``.
+
+    Returns:
+        The string "論文相似度: " ("paper similarity") followed by the score
+        from ``splitWord_PersionSimlaryty`` scaled to percent, rounded to
+        two decimals, and a trailing "%".
+    """
+    str_L = _read_pdf_text(A)
+    str_Y = _read_pdf_text(B)
+    score = splitWord_PersionSimlaryty(str_L,str_Y)
+    return "論文相似度: "+str(round(score*100,2))+"%"
+# Build a two-file-input, text-output interface around `similarity` and launch it.
+demo = gr.Interface(fn=similarity, inputs=["file", "file"], outputs='text')
+demo.launch()