from flask import Flask, request
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
import torch
import gradio as gr
import os
import re
import pdfplumber

app = Flask(__name__)

ACCESS_TOKEN = os.environ["ACCESS_TOKEN"]
config = RobertaConfig.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token=ACCESS_TOKEN)
model = RobertaForSequenceClassification.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token=ACCESS_TOKEN, config=config)

model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# function to break text into an array of sentences
def text_to_sentences(text):
    # ensure a space after sentence-ending punctuation, then split on that punctuation
    text = re.sub(r'(?<=[.!?])(?=[^\s])', r' ', text)
    return re.split(r'[.!?]', text)

# function to concatenate sentences into chunks of size 600 or less
def chunks_of_600(text, chunk_size=600):
    sentences = text_to_sentences(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk + sentence) <= chunk_size:
            current_chunk += sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    chunks.append(current_chunk)
    return chunks
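
# Illustrative sketch (not part of the original flow): because text_to_sentences
# splits on the punctuation characters themselves, the punctuation is dropped and
# each chunk is a run of sentences joined back-to-back, capped at roughly
# chunk_size characters; a single sentence longer than chunk_size still becomes
# its own oversized chunk.
#
#   example_chunks = chunks_of_600("First sentence. Second sentence. Third sentence.")
#   # -> ['First sentence Second sentence Third sentence']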
    
def predict(query, device="cpu"):
    # encode without special tokens; BOS/EOS are added back manually below
    tokens = tokenizer.encode(query, add_special_tokens=False)
    all_tokens = len(tokens)
    tokens = tokens[:tokenizer.model_max_length - 2]
    used_tokens = len(tokens)
    tokens = torch.tensor([tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]).unsqueeze(0)
    mask = torch.ones_like(tokens)

    with torch.no_grad():
        logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
        probs = logits.softmax(dim=-1)

    # per-class probabilities, unpacked assuming the label order [fake, real]
    fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real

def findRealProb(text):
    chunksOfText = chunks_of_600(text)
    results = []
    for chunk in chunksOfText:
        output = predict(chunk)
        print(chunk)
        print("-----------------------------------")
        results.append([output, len(chunk)])

    # length-weighted average of the per-chunk "real" probabilities
    ans = 0
    for prob, length in results:
        ans = ans + prob * length
    realProb = ans / len(text)
    return {"Real": realProb, "Fake": 1 - realProb, "results": results, "text": text}

def upload_file(file):
    if file:
        pdf_file = file.name
        print(file, pdf_file)
        text = ""
        with pdfplumber.open(pdf_file) as pdf:
            cnt = 0
            # only read the first few pages to keep inference fast
            for page in pdf.pages:
                cnt += 1
                # extract_text can return None for pages with no extractable text
                text += page.extract_text(x_tolerance=1) or ""
                if cnt > 5:
                    break
        text = text.replace('\n', ' ')
        return findRealProb(text)
    else:
        return {"error": 'No PDF file found in request'}


demo = gr.Interface(
        fn=upload_file,
        inputs=gr.File(),
        outputs=gr.JSON(),
        article="Visit <a href=\"https://ai-content-detector.online/\">AI Content Detector</a> for better user experience!",
        interpretation="default",
)

demo.launch(show_api=False)