File size: 3,190 Bytes
1c38e8c
3118f3a
1360051
 
4457e86
aab722c
3118f3a
1360051
ce337fb
f973f9e
 
ce337fb
bcd23af
3118f3a
4457e86
1360051
ce337fb
ed60e85
 
 
3118f3a
 
 
 
 
 
ed60e85
 
 
 
d90983a
ce337fb
1360051
3118f3a
 
 
 
1360051
 
6adf923
c0a3abb
a9d8b80
1360051
ce337fb
 
3118f3a
1360051
 
 
 
 
c0a3abb
1360051
 
 
3118f3a
ce337fb
1360051
 
ce337fb
f973f9e
1360051
2b37588
3118f3a
 
2b37588
1360051
6adf923
3118f3a
 
 
 
6adf923
fcb88c5
 
 
6adf923
bcd23af
f973f9e
ed60e85
1360051
 
ce337fb
 
 
fcb88c5
 
 
 
3118f3a
fcb88c5
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import streamlit as st
import tensorflow as tf
import numpy as np
import nltk
import os
from nltk.tokenize import sent_tokenize
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification

# ๐Ÿ“ Hugging Face cache dir
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"

# ๐Ÿ“ฅ Download NLTK punkt tokenizer
nltk_data_path = "/tmp/nltk_data"
nltk.download("punkt_tab", download_dir=nltk_data_path)  # โœ… fixed: should be "punkt", not "punkt_tab"
nltk.data.path.append(nltk_data_path)

# Cache the tokenizer/model so they are loaded once per server process,
# not on every Streamlit rerun.
@st.cache_resource
def load_model_and_tokenizer():
    """Load the DistilBERT tokenizer and the fine-tuned sentence classifier.

    Returns:
        (tokenizer, model) pair; cached for the lifetime of the process
        via st.cache_resource.
    """
    hf_cache = "/tmp/huggingface"  # keep downloads on a writable volume
    bert_tokenizer = DistilBertTokenizerFast.from_pretrained(
        "distilbert-base-uncased", cache_dir=hf_cache
    )
    classifier = TFDistilBertForSequenceClassification.from_pretrained(
        "sundaram07/distilbert-sentence-classifier", cache_dir=hf_cache
    )
    return bert_tokenizer, classifier

tokenizer, model = load_model_and_tokenizer()

# Score a single sentence with the classifier.
def predict_sentence_ai_probability(sentence):
    """Return the model's scalar AI-probability score for one sentence.

    The sentence is tokenized (truncated/padded) and passed through the
    classifier; the first logit is squashed with a sigmoid.
    """
    encoded = tokenizer(sentence, return_tensors="tf", truncation=True, padding=True)
    logits = model(encoded).logits
    # NOTE(review): sigmoid over logits[0][0] assumes a single-logit head;
    # for a 2-class head softmax would be the usual choice — confirm the
    # fine-tuned model's config.
    return tf.sigmoid(logits)[0][0].numpy()

# Sentence-level analysis of a whole text.
def predict_ai_generated_percentage(text, threshold=0.15):
    """Score every sentence of *text* and aggregate to a percentage.

    Args:
        text: raw input text; leading/trailing whitespace is stripped.
        threshold: per-sentence score cutoff for the AI label.

    Returns:
        (ai_percentage, results) where results is a list of
        (sentence, score, is_ai) tuples; (0.0, []) for empty input.
    """
    sentences = sent_tokenize(text.strip())
    if not sentences:
        return 0.0, []

    results = []
    for sentence in sentences:
        score = predict_sentence_ai_probability(sentence)
        # NOTE(review): a *low* score (<= threshold) marks a sentence as AI
        # here — this looks inverted for a variable named "prob_ai"; confirm
        # against the model's label convention before changing it.
        results.append((sentence, score, score <= threshold))

    flagged = sum(1 for _, _, is_ai in results if is_ai)
    return (flagged / len(sentences)) * 100, results

# ๐Ÿ–ฅ๏ธ Streamlit UI
st.set_page_config(page_title="AI Detector", layout="wide")
st.title("๐Ÿง  AI Content Detector")
st.markdown(
    "This app detects the percentage of **AI-generated content** using "
    "sentence-level analysis with a fine-tuned DistilBERT model."
)

# ๐Ÿ“‹ Text input
user_input = st.text_area(
    "๐Ÿ“‹ Paste your text below to check for AI-generated sentences:",
    height=300
)

# ๐Ÿ“ค Output placeholder (to clear previous results)
output_container = st.empty()

# ๐Ÿ” Analyze button logic
if st.button("๐Ÿ” Analyze"):
    if not user_input.strip():
        st.warning("โš ๏ธ Please enter some text.")
    else:
        ai_percentage, analysis_results = predict_ai_generated_percentage(user_input)
        if len(analysis_results) == 0:
            st.warning("โš ๏ธ Not enough valid sentences to analyze.")
        else:
            with output_container.container():
                st.subheader("๐Ÿ” Sentence-level Analysis")
                for i, (sentence, prob, is_ai) in enumerate(analysis_results, start=1):
                    label = "๐ŸŸข Human" if not is_ai else "๐Ÿ”ด AI"
                    st.markdown(f"**{i}.** _{sentence}_\n\n โ†’ {label} (prob={prob:.3f})")

                st.subheader("๐Ÿ“Š Final Result")
                st.success(f"Estimated **AI-generated content**: **{ai_percentage:.2f}%**")