File size: 3,834 Bytes
3ced2ed
 
 
 
 
 
42d7fda
 
3ced2ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ec7c36
 
3ced2ed
 
 
 
 
 
 
 
c6a55b0
66f5dd3
ae81da2
 
 
 
 
 
 
 
 
 
 
 
 
 
42d7fda
 
39b9309
42d7fda
5938c69
 
 
42d7fda
16cfd3f
42d7fda
 
 
 
 
 
 
 
 
 
 
3ced2ed
 
 
 
 
 
 
 
 
94abe9f
3ced2ed
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from flask import Flask, request
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
import torch
import gradio as gr
import os
import re
import PyPDF2

app = Flask(__name__)

# Startup configuration: the HF auth token must be present in the environment;
# a missing ACCESS_TOKEN raises KeyError immediately at import time (fail fast).
ACCESS_TOKEN = os.environ["ACCESS_TOKEN"]
# Download the private detector checkpoint and its config from the Hub.
config = RobertaConfig.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token= ACCESS_TOKEN)
model = RobertaForSequenceClassification.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token= ACCESS_TOKEN, config = config)

# Tokenizer comes from the public roberta-base vocab, not the detector repo.
# NOTE(review): `map_location` is a torch.load kwarg, not a tokenizer option —
# it is silently ignored here; confirm it can be dropped.
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name, map_location=torch.device('cpu'))

# function to break text into an array of sentences
def text_to_sentences(text):
    """Split *text* into sentence fragments on '.', '!' or '?'.

    The terminating punctuation is consumed by the split, so fragments do
    not contain it; a trailing empty string appears when the text ends
    with punctuation, and empty input yields [''].
    """
    # Insert a space after sentence-ending punctuation glued to the next
    # word ("foo.Bar" -> "foo. Bar"). The original discarded re.sub's
    # return value (strings are immutable), making it a no-op — assign it
    # so the normalization actually takes effect.
    text = re.sub(r'(?<=[.!?])(?=[^\s])', r' ', text)
    return re.split(r'[.!?]', text)

# function to concatenate sentences into chunks of size chunk_size or less
def chunks_of_600(text, chunk_size=600):
    """Greedily pack the sentences of *text* into chunks of at most
    *chunk_size* characters.

    A single sentence longer than *chunk_size* still becomes its own
    chunk. Unlike the original, empty chunks are never emitted (the old
    code appended "" when the first sentence was over-long, and returned
    [""] for empty input, feeding empty strings to the model).
    """
    sentences = text_to_sentences(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # Compare lengths directly instead of building a throwaway
        # concatenated string just to measure it.
        if len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += sentence
        else:
            if current_chunk:  # skip empty chunks (over-long first sentence)
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:  # flush the final partial chunk, if non-empty
        chunks.append(current_chunk)
    return chunks
    
def predict(query, device="cpu"):
    """Return the model's probability that *query* is human-written.

    The text is tokenized, truncated to fit the model's context window
    (leaving room for the BOS/EOS specials), and scored in a no-grad
    forward pass. Index 0 of the softmax is "fake", index 1 is "real";
    only the "real" probability is returned.
    """
    token_ids = tokenizer.encode(query)
    # Reserve two positions for the BOS and EOS tokens added below.
    token_ids = token_ids[: tokenizer.model_max_length - 2]
    input_ids = torch.tensor(
        [tokenizer.bos_token_id] + token_ids + [tokenizer.eos_token_id]
    ).unsqueeze(0)
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
        probs = outputs[0].softmax(dim=-1)

    fake_prob, real_prob = probs.detach().cpu().flatten().numpy().tolist()
    return real_prob

def findRealProb(text):
    """Score *text* chunk-by-chunk and return length-weighted probabilities.

    Returns a dict with keys "Real" (weighted mean real-probability),
    "Fake" (its complement), "results" (per-chunk [probability, length]
    pairs) and the original "text". "Real"/"Fake" are None when no
    scorable chunks exist (empty/whitespace-only input) — the original
    raised ZeroDivisionError there.
    """
    chunksOfText = chunks_of_600(text)
    results = []
    for chunk in chunksOfText:
        output = predict(chunk)
        results.append([output, len(chunk)])

    # Weighted average over the chunk lengths themselves. The original
    # divided by len(text), but the chunks exclude the punctuation that the
    # sentence split consumed, so chunk lengths sum to less than len(text)
    # and the probability was systematically underestimated.
    total_length = sum(length for _, length in results)
    if total_length == 0:
        return {"Real": None, "Fake": None, "results": results, "text": text}
    weighted_sum = sum(prob * length for prob, length in results)
    realProb = weighted_sum / total_length
    return {"Real": realProb, "Fake": 1 - realProb, "results": results, "text": text}

def upload_file(file):
    """Extract text from an uploaded PDF and run the detector over it.

    *file* is the Gradio file wrapper (exposes the temp path as .name).
    Returns the result dict from findRealProb, or an error dict when no
    file was supplied.
    """
    # Guard clause instead of a trailing else-branch.
    if not file:
        return {"error": 'No PDF file found in request'}

    with open(file.name, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ''
        # Iterate pages directly rather than indexing by range(len(...)).
        for page in pdf_reader.pages:
            # extract_text() may return None for pages with no text layer.
            text += page.extract_text() or ''
    return findRealProb(text)


# Gradio UI wiring: one PDF file input, JSON output with the detection result.
demo = gr.Interface(
        fn=upload_file, 
        inputs=gr.File(), 
         article = "Visit <a href = \"https://ai-content-detector.online/\">AI Content Detector</a> for better user experience!",
        # NOTE(review): gr.outputs.JSON and `interpretation` are legacy Gradio
        # APIs removed in gradio>=4 — confirm the pinned gradio version.
        outputs=gr.outputs.JSON(),
        interpretation="default",)

# show_api=False hides the auto-generated API docs page.
demo.launch(show_api=False)