"""Gradio app that scores how likely the text of an uploaded PDF is human-written.

The PDF text is split into sentence-aligned chunks, each chunk is scored by a
RoBERTa sequence classifier, and the per-chunk scores are combined into a
length-weighted average.
"""

from flask import Flask, request
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
import torch
import gradio as gr
import os
import re
import pdfplumber

app = Flask(__name__)

# Required: a missing variable fails fast with KeyError at startup.
ACCESS_TOKEN = os.environ["ACCESS_TOKEN"]

config = RobertaConfig.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token=ACCESS_TOKEN)
model = RobertaForSequenceClassification.from_pretrained(
    "PirateXX/ChatGPT-Text-Detector", use_auth_token=ACCESS_TOKEN, config=config
)

model_name = "roberta-base"
# NOTE: the original call also passed map_location=torch.device('cpu'); that is
# a torch.load() argument, not a tokenizer option, so it has been dropped.
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# One sentence = optional run of non-terminator characters followed by a run of
# terminators, or a trailing run of characters with no terminator at all.
_SENTENCE_RE = re.compile(r'[^.!?]*[.!?]+|[^.!?]+')

# Number of leading PDF pages that are analysed (the original loop stopped
# after processing the sixth page).
MAX_PDF_PAGES = 6


# function to break text into an array of sentences
def text_to_sentences(text):
    """Split *text* into sentences, keeping each sentence's terminator.

    The split is lossless: ``"".join(text_to_sentences(text)) == text``.
    Whitespace following a terminator stays at the start of the next
    sentence. Empty input yields an empty list.

    The previous implementation split on ``[.!?]`` and thereby deleted every
    terminator from the text handed to the classifier; it also called
    ``re.sub`` without using the result, which had no effect.
    """
    return _SENTENCE_RE.findall(text)


# function to concatenate sentences into chunks of size 600 or less
def chunks_of_600(text, chunk_size=600):
    """Group the sentences of *text* into chunks of at most *chunk_size* chars.

    Sentences are appended to the current chunk while they fit. A single
    sentence longer than *chunk_size* is cut into *chunk_size*-sized pieces so
    that the size limit always holds. Empty chunks are never emitted, so empty
    input yields an empty list.

    Raises:
        ValueError: if *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    chunks = []
    current_chunk = ""
    for sentence in text_to_sentences(text):
        if len(current_chunk) + len(sentence) <= chunk_size:
            current_chunk += sentence
            continue

        # The sentence does not fit: flush what has been collected so far.
        if current_chunk:
            chunks.append(current_chunk)

        # Hard-split a sentence that on its own exceeds the limit.
        while len(sentence) > chunk_size:
            chunks.append(sentence[:chunk_size])
            sentence = sentence[chunk_size:]
        current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def predict(query, device="cpu"):
    """Return the classifier's probability for the second ("real") class.

    The two softmax outputs are read as ``(fake, real)``, in that order, as
    the original code did.

    Args:
        query: text to score.
        device: torch device the input tensors are moved to. The model itself
            is not moved here, so it must already live on that device.
    """
    # encode() adds BOS/EOS by default in current transformers releases; they
    # are added by hand below, so ask for the bare token ids to avoid
    # duplicating them.
    # NOTE(review): this changes the scores versus the previous doubled
    # BOS/EOS input - confirm against how the checkpoint was trained.
    tokens = tokenizer.encode(query, add_special_tokens=False)
    # Leave room for the two special tokens within the model's maximum length.
    tokens = tokens[:tokenizer.model_max_length - 2]
    tokens = torch.tensor(
        [tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]
    ).unsqueeze(0)
    mask = torch.ones_like(tokens)

    with torch.no_grad():
        logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
        probs = logits.softmax(dim=-1)

    _fake, real = probs.detach().cpu().flatten().tolist()
    return real


def findRealProb(text):
    """Score *text* chunk by chunk and return the length-weighted result.

    Returns:
        A dict with keys ``"Real"`` (weighted mean of the per-chunk scores),
        ``"Fake"`` (``1 - Real``), ``"results"`` (a list of
        ``[score, chunk_length]`` pairs) and ``"text"`` (the input).

    Raises:
        ValueError: if *text* is empty, since there is nothing to score.
    """
    results = []
    for chunk in chunks_of_600(text):
        output = predict(chunk)
        print(chunk)
        print("-----------------------------------")
        results.append([output, len(chunk)])

    # Normalise by the length that was actually scored so the weights sum to 1.
    total_length = sum(length for _, length in results)
    if total_length == 0:
        raise ValueError("Cannot score empty text")

    realProb = sum(prob * length for prob, length in results) / total_length
    return {"Real": realProb, "Fake": 1 - realProb, "results": results, "text": text}


def upload_file(file):
    """Gradio handler: extract text from the uploaded PDF and score it.

    Only the first ``MAX_PDF_PAGES`` pages are read.

    Args:
        file: the uploaded file object from ``gr.File`` (its ``.name`` is used
            as the path to open), or a falsy value when nothing was uploaded.

    Returns:
        The dict produced by ``findRealProb``, or ``{"error": ...}`` when no
        file was supplied or no text could be extracted.
    """
    if not file:
        return {"error": 'No PDF file found in request'}

    pdf_file = file.name
    print(file, pdf_file)

    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may yield None or "" for a page without a text layer.
        page_texts = [
            page.extract_text(x_tolerance=1) or ""
            for page in pdf.pages[:MAX_PDF_PAGES]
        ]

    # Join pages with a space so words are not fused across a page break.
    text = " ".join(part for part in page_texts if part).replace('\n', ' ')
    if not text.strip():
        return {"error": 'No extractable text found in PDF'}
    return findRealProb(text)


demo = gr.Interface(
    fn=upload_file,
    inputs=gr.File(),
    article="Visit AI Content Detector for better user experience!",
    outputs=gr.outputs.JSON(),
    interpretation="default",
)

demo.launch(show_api=False)