File size: 9,616 Bytes
35b7ac6
 
f24c4ab
633e441
35b7ac6
 
633e441
f24c4ab
633e441
f24c4ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
633e441
f24c4ab
 
633e441
f24c4ab
 
 
633e441
 
 
 
 
 
 
 
 
 
f24c4ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
633e441
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35b7ac6
633e441
 
35b7ac6
 
633e441
 
 
 
 
 
 
 
f24c4ab
 
 
 
633e441
f24c4ab
 
 
 
 
 
 
 
 
 
633e441
 
 
 
 
 
 
043812c
 
 
 
 
 
 
 
 
 
 
 
9886933
 
d740850
9886933
043812c
 
f24c4ab
633e441
 
 
 
9a3b49f
 
e723247
 
9a3b49f
633e441
 
373753a
633e441
 
9a3b49f
043812c
 
 
 
 
 
 
 
 
 
633e441
 
 
9a3b49f
 
 
ae4db81
 
 
 
633e441
 
9a3b49f
7280a44
9a3b49f
633e441
 
9a3b49f
043812c
 
 
 
 
 
 
 
 
 
633e441
 
9a3b49f
 
373753a
 
 
 
 
 
e723247
043812c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import gradio as gr
import tensorflow as tf
import keras
import numpy as np
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import os

# Custom perplexity metric, registered so saved models can be reloaded.
@keras.saving.register_keras_serializable(package="Custom")
class Perplexity(keras.metrics.Metric):
    """Streaming perplexity: exp of the running mean cross-entropy."""

    def __init__(self, name='perplexity', dtype=None, **kwargs):
        super().__init__(name=name, dtype=dtype, **kwargs)
        # Running mean of per-sample sparse categorical cross-entropy.
        self.cross_entropy = keras.metrics.Mean(name='cross_entropy')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Fold this batch's cross-entropy into the running mean."""
        batch_ce = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
        self.cross_entropy.update_state(batch_ce, sample_weight)

    def result(self):
        # Perplexity is e raised to the mean cross-entropy.
        return tf.exp(self.cross_entropy.result())

    def reset_state(self):
        self.cross_entropy.reset_state()

    def get_config(self):
        # No extra constructor args beyond the base class.
        return super().get_config()

# Text cleaning function
def clean_text(text):
    """Normalize raw text for the tokenizers.

    Steps: strip characters other than word chars, whitespace and basic
    punctuation; drop standalone numbers; lowercase; expand the 'co2'
    abbreviation; collapse whitespace.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lowercased string.
    """
    # Keep word characters, whitespace and . , ! ? only.
    text = re.sub(r'[^\w\s.,!?]', '', text)
    # Remove standalone numbers ('co2' survives: no word boundary inside it).
    text = re.sub(r'\b\d+\b', '', text)
    # Lowercase BEFORE expanding 'co2' so uppercase 'CO2' is also expanded
    # (the original replaced first and therefore missed 'CO2').
    text = text.lower()
    text = text.replace('co2', 'carbon dioxide')
    # Collapse runs of whitespace to single spaces and trim the ends.
    text = ' '.join(text.split())
    return text

# Load models and tokenizers
def load_models():
    """Load both Keras models and their pickled tokenizers from disk.

    Returns:
        Tuple of (classifier_model, classifier_tokenizer,
        textgen_model, textgen_tokenizer).

    Raises:
        Exception: re-raised after logging if any artifact fails to load.
    """
    print("Loading models and tokenizers...")

    def _unpickle(path):
        # Tokenizers were serialized with pickle at training time.
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    try:
        # The scope makes the custom Perplexity metric visible to the loader.
        with keras.saving.custom_object_scope({'Perplexity': Perplexity}):
            classifier_model = keras.models.load_model('classifier_model.keras')
            textgen_model = keras.models.load_model('textgen_model.keras')
        print("Models loaded successfully with custom objects")
    except Exception as e:
        print(f"Error loading models with custom objects: {e}")
        raise

    try:
        classifier_tokenizer = _unpickle('classifier_tokenizer.pkl')
        textgen_tokenizer = _unpickle('textgen_tokenizer.pkl')
        print("Tokenizers loaded successfully")
    except Exception as e:
        print(f"Error loading tokenizers: {e}")
        raise

    return classifier_model, classifier_tokenizer, textgen_model, textgen_tokenizer

# Classification function
def classify_text(text, model, tokenizer):
    """Predict the subject of *text* with the classifier model.

    Returns:
        (label, confidence_percent) where label is one of
        'Science', 'Maths', 'History'.
    """
    labels = ['Science', 'Maths', 'History']
    # Same preprocessing pipeline used at training time.
    seq = tokenizer.texts_to_sequences([clean_text(text)])
    padded = pad_sequences(seq, maxlen=255, padding='pre')
    probs = model.predict(padded)[0]
    # Highest-probability class wins; confidence reported as a percentage.
    best = np.argmax(probs)
    return labels[best], probs[best] * 100

# Text generation function
def generate_text(prompt, model, tokenizer, max_length=50, temperature=0.7):
    """Autoregressively extend *prompt* one word at a time.

    Args:
        prompt: Seed text.
        model: Next-word model returning a softmax over the vocabulary.
        tokenizer: Keras tokenizer fitted on the training corpus.
        max_length: Maximum number of words to append.
        temperature: Sampling temperature; higher values increase randomness.

    Returns:
        The cleaned prompt followed by the generated words.
    """
    input_text = clean_text(prompt)

    # Build the index->word map ONCE instead of scanning word_index on every
    # generated token (the original inner loop was O(vocab) per word).
    index_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(max_length):
        token_list = tokenizer.texts_to_sequences([input_text])[0]
        token_list = pad_sequences([token_list], maxlen=255, padding='pre')

        predicted = model.predict(token_list, verbose=0)[0]

        # Temperature scaling followed by re-normalization. Clip to avoid
        # log(0) -> -inf RuntimeWarnings on zero-probability entries.
        predicted = np.log(np.clip(predicted, 1e-10, 1.0)) / temperature
        exp_preds = np.exp(predicted)
        predicted = exp_preds / np.sum(exp_preds)

        # Sample the next token id from the adjusted distribution.
        predicted_index = np.random.choice(len(predicted), p=predicted)

        # Index 0 is the padding id and maps to no word; stop on an
        # unmapped index, mirroring the original's break on "".
        output_word = index_word.get(predicted_index, "")
        if output_word == "":
            break

        input_text += " " + output_word

    return input_text

# Print environment info for debugging
print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {tf.keras.__version__}")
# Listing the working directory helps confirm the model/tokenizer files
# were deployed next to the script.
print(f"Current directory contents: {os.listdir('.')}")

# Load models with error handling
# Fail fast at import time: the UI is useless without the models, so log
# and re-raise to surface the error in the host's logs.
try:
    print("Starting model loading process...")
    classifier_model, classifier_tokenizer, textgen_model, textgen_tokenizer = load_models()
    print("Models and tokenizers loaded successfully")
except Exception as e:
    print(f"Error in model loading process: {e}")
    raise

# Create Gradio interface functions
def classify_interface(text):
    """Gradio wrapper: classify *text* and format the result for display."""
    label, score = classify_text(text, classifier_model, classifier_tokenizer)
    return f"Subject: {label} (Confidence: {score:.2f}%)"

def generate_interface(prompt, length=50, temp=0.7):
    """Gradio wrapper: generate text with user-chosen length and temperature."""
    # Slider values arrive as numbers; coerce explicitly before the call.
    word_budget = int(length)
    creativity = float(temp)
    return generate_text(prompt, textgen_model, textgen_tokenizer,
                         max_length=word_budget, temperature=creativity)

# Define example inputs for the classifier
# Two examples per subject (Science, Maths, History) so users can see all
# three classes demonstrated.
classifier_examples = [
    ["The process of photosynthesis converts light energy into chemical energy, producing oxygen as a byproduct."],
    ["The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse equals the sum of squares of the other two sides."],
    ["The Industrial Revolution began in Great Britain in the late 18th century and spread to other parts of Europe and North America."],
    ["Atoms consist of a nucleus containing protons and neutrons, surrounded by electrons that orbit in energy levels."],
    ["Differential equations are mathematical equations that relate a function with its derivatives, representing rates of change."],
    ["The Treaty of Versailles was signed in 1919, officially ending World War I and imposing harsh penalties on Germany."]
]

# Define example inputs for the text generator
# Each row is [prompt, max_length, temperature], matching the generator
# interface's three inputs.
generator_examples = [
    ["Newton's laws of motion explain", 30, 0.8],
    ["Climate change affects ecosystems by", 20, 0.7],
    ["Quantum mechanics revolutionized physics when", 20, 0.9],
    ["Chemical reactions occur when", 25, 0.6]
]

# Create Gradio interface
# Two-tab layout: one tab for classification, one for generation, plus a
# shared "About" section at the bottom.
with gr.Blocks(title="Science Text Analyzer") as demo:
    gr.Markdown("# Science Text Analyzer")
    
    with gr.Tab("Classify Text"):
        gr.Markdown("### Classify Academic Text")
        gr.Markdown(
            "The **Science Text Analyzer** uses an **LSTM-based text classification model** trained on curated academic datasets sourced from **Hugging Face**. "
            "It predicts whether input text belongs to **Science**, **Mathematics**, or **History**, leveraging **sequential context** and **language structure** for accurate subject classification."
        )
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(label="Enter Text", lines=5, placeholder="Paste a sentence or paragraph here...")
                classify_button = gr.Button("Classify")
            with gr.Column():
                output = gr.Textbox(label="Classification Result", placeholder="The predicted subject and confidence will appear here.")
        
        # Add examples for the classifier
        # cache_examples=True runs the model on each example at startup so
        # clicking an example shows a precomputed result instantly.
        gr.Examples(
            examples=classifier_examples,
            inputs=text_input,
            outputs=output,
            fn=classify_interface,
            cache_examples=True
        )
        
        classify_button.click(fn=classify_interface, inputs=text_input, outputs=output)
    
    with gr.Tab("Generate Text"):
        gr.Markdown("### Generate Academic Text")
        gr.Markdown(
            "Use this tool to generate educational text based on a given prompt. "
            "You can control the output length and creativity using the sliders below. "
            "**Note:** Longer text lengths will take more time to generate, so please be patient when requesting extensive outputs."
            )

        with gr.Row():
            with gr.Column():
                prompt_input = gr.Textbox(label="Enter a Prompt", lines=3, placeholder="Type an introductory sentence or concept...")
                # Step of 10 keeps the length choices coarse; generation is
                # one model call per word, so length dominates latency.
                length_slider = gr.Slider(minimum=10, maximum=60, value=50, step=10, label="Maximum Length (words)")
                temp_slider = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature (Creativity Level)")
                generate_button = gr.Button("Generate")
            with gr.Column():
                generated_output = gr.Textbox(label="Generated Text", lines=8, placeholder="The generated text will appear here.")
        
        # Add examples for the text generator
        gr.Examples(
            examples=generator_examples,
            inputs=[prompt_input, length_slider, temp_slider],
            outputs=generated_output,
            fn=generate_interface,
            cache_examples=True
        )
        
        generate_button.click(fn=generate_interface, inputs=[prompt_input, length_slider, temp_slider], outputs=generated_output)

    gr.Markdown("### About This App")
    gr.Markdown(
    "The **Science Text Analyzer** uses deep learning models trained on curated academic datasets to classify and generate content "
    "related to academic disciplines. The **classifier** categorizes input text into one of three subjects: **Science**, **Mathematics**, or **History**. "
    "The **text generator** produces coherent scientific passages, especially focused on **Physics**, **Chemistry**, and **Biology**, making it a valuable tool "
    "for **educational research**, **content creation**, and **curriculum support**."
)

# Launch the app
demo.launch()