Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| # In[ ]: | |
| import PyPDF2 | |
| import jieba | |
| import jieba.posseg as pseg | |
| from jieba import analyse | |
| import gradio as gr | |
| import numpy as np | |
| import os | |
def countIDF(text, topK):
    """Return raw occurrence counts of the top TF-IDF keywords of ``text``.

    The text is tokenised with ``jieba.cut`` to build a token -> count table,
    then ``jieba.analyse.extract_tags`` picks up to ``topK`` keywords. The
    result lists each keyword's count, in the order the keywords were ranked.

    Args:
        text: The text to analyse.
        topK: Maximum number of keywords to extract.

    Returns:
        A list of integer counts, one per extracted keyword. It can be
        shorter than ``topK`` when the text yields fewer keywords.
    """
    frequencies = {}
    for token in jieba.cut(text):
        frequencies[token] = frequencies.get(token, 0) + 1

    # withWeight=True yields (keyword, weight) pairs; only the keyword is used.
    ranked = analyse.extract_tags(text, topK, withWeight=True)
    return [frequencies[entry[0]] for entry in ranked]
def pers_sim(a, b):
    """Return the Pearson correlation coefficient of two numeric vectors.

    Args:
        a: Sequence of numbers.
        b: Sequence of numbers with the same shape as ``a``.

    Returns:
        The correlation as a float (nominally within [-1, 1]), or ``0.0``
        when the correlation is undefined: empty input, or either vector
        has zero variance.

    Raises:
        ValueError: If ``a`` and ``b`` do not have the same shape.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    # Reject mismatches explicitly: NumPy would otherwise silently broadcast
    # a length-1 vector against a longer one and produce a meaningless value.
    if a.shape != b.shape:
        raise ValueError(
            "pers_sim requires vectors of the same shape, got "
            + str(a.shape) + " and " + str(b.shape)
        )
    if a.size == 0:
        return 0.0

    a = a - np.average(a)
    b = b - np.average(b)
    denominator = np.sqrt(np.sum(a**2)) * np.sqrt(np.sum(b**2))

    # A constant vector has zero variance; dividing would yield nan.
    if denominator == 0:
        return 0.0
    return float(np.sum(a * b) / denominator)
def splitWord_PersionSimlaryty(str_a, str_b, topK=20, sim=pers_sim):
    """Score the similarity of two texts from their keyword-count vectors.

    Each text is reduced by ``countIDF`` to the occurrence counts of its own
    top ``topK`` keywords, and the two count vectors are handed to ``sim``.

    Args:
        str_a: First text.
        str_b: Second text.
        topK: Number of keywords requested from each text.
        sim: Callable taking the two count vectors and returning a score;
            defaults to ``pers_sim``.

    Returns:
        Whatever ``sim`` returns for the two vectors.
    """
    # NOTE(review): each vector follows its own text's keyword ranking, so
    # position i may refer to different words in the two texts.
    return sim(countIDF(str_a, topK), countIDF(str_b, topK))
def _extract_pdf_text(pdf_file):
    """Return the text of every page of ``pdf_file``, space separated."""
    # PyPDF2 >= 2 exposes PdfReader / extract_text; the legacy camelCase
    # names were removed in 3.x, so prefer the new ones and fall back.
    reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
    reader = reader_cls(pdf_file)

    chunks = []
    # Iterate over all pages; the previous range(n - 1) dropped the last one.
    for page in reader.pages:
        extract = getattr(page, "extract_text", None) or page.extractText
        chunks.append(extract())

    # Same layout as before: a leading space, then each page plus a space.
    return " " + "".join(chunk + " " for chunk in chunks)


def similarity(A, B):
    """Compare two PDF papers and return a formatted similarity percentage.

    The text of every page of each PDF is extracted and the two texts are
    scored with ``splitWord_PersionSimlaryty``.

    Args:
        A: First PDF, passed directly to the PyPDF2 reader.
        B: Second PDF, passed directly to the PyPDF2 reader.
            (Presumably the objects supplied by Gradio's "file" input —
            confirm they are accepted by the installed PyPDF2 version.)

    Returns:
        A string of the form ``"論文相似度: <score>%"`` where the score is
        the similarity multiplied by 100 and rounded to two decimals.
    """
    str_L = _extract_pdf_text(A)
    str_Y = _extract_pdf_text(B)
    score = splitWord_PersionSimlaryty(str_L, str_Y)
    return "論文相似度: " + str(round(score * 100, 2)) + "%"
# Heading displayed at the top of the Gradio page.
title = "Paper Similarity 論文相似度比較"

# Introductory text (with embedded HTML) displayed under the heading.
description = '''
National Taiwan University on Tuesday (August 9) announced a decision to rescind a master's degree it gave to Lin Chih-chien (林智堅) in 2017, citing plagiarism after a meeting by the school's academic ethics committee.
"The act sullied the reputation of National Taiwan University...and the school will reinforce the importance of academic integrity and ethics, not letting it happen again."
With this in mind, we proposed machine learning method to analyze the similarity between 2 papers. Provide an objective indicator for your reference.
台大周二(8月9日)宣布撤銷2017年授予林智堅的碩士學位,理由是該校學術倫理委員會開會後認為存在抄襲。
“該行為玷污了台大的聲譽……學校將加強學術誠信和道德的重要性,不會讓這種事再次發生。”
考慮到這一點,我們提出了機器學習方法來分析兩篇論文之間的相似性。 提供一個客觀的指標供您參考。
<th>
<iframe width="560" height="315" src="https://www.youtube.com/embed/TQNzsQ6I69k" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
</th>
### Please upload 2 papers below, the format is limited to pdf
### 以下請輸入2篇論文, 格式限定pdf
'''

# Bind `demo` to the Interface itself rather than to launch()'s return
# value, so the name refers to the app object; then start the server as
# before (two file inputs, one text output).
demo = gr.Interface(
    similarity,
    ["file", "file"],
    outputs='text',
    title=title,
    description=description,
)
demo.launch()