"""Flask web app for automated essay scoring (AES).

Combines two signals:
  * BERT regression models (hosted on the Hugging Face Hub) that predict
    per-category essay scores, and
  * rule-based checks (LanguageTool grammar, pyspellchecker spelling,
    vocabulary-diversity heuristics).

Designed for Hugging Face Spaces: caches live under XDG_CACHE_HOME (or
/tmp/cache) and the server listens on $PORT (default 7860).
"""

from flask import Flask, render_template, request
from transformers import BertForSequenceClassification, BertTokenizer
from language_tool_python import LanguageTool
from spellchecker import SpellChecker
from collections import Counter
import string
import torch
import numpy as np
import os
from pathlib import Path

app = Flask(__name__, template_folder='.')

# Configure cache directories. On Spaces the working dir may be read-only,
# so default to /tmp/cache unless XDG_CACHE_HOME is set.
cache_base = os.getenv('XDG_CACHE_HOME', '/tmp/cache')
huggingface_cache = os.path.join(cache_base, 'huggingface')
languagetool_cache = os.path.join(cache_base, 'languagetool')

# Ensure cache directories exist before any library tries to use them.
Path(huggingface_cache).mkdir(parents=True, exist_ok=True)
Path(languagetool_cache).mkdir(parents=True, exist_ok=True)

# Initialize LanguageTool against the public remote server. On failure the
# app still starts; grammar-dependent features degrade gracefully (None).
try:
    grammar_tool = LanguageTool(
        'en-US',
        remote_server='https://api.languagetool.org'
    )
    print("LanguageTool initialized successfully")
except Exception as e:
    print(f"Error initializing LanguageTool: {e}")
    grammar_tool = None

# Initialize SpellChecker (pure-local, cannot fail like the remote tool).
spell = SpellChecker()

# Load Hugging Face models. Both regression checkpoints live in subfolders
# of a single Hub repo; on any failure all three are set to None and the
# app serves only the rule-based scores.
MODEL_NAME = "Hak978/aes-bert-models"
try:
    model_website1 = BertForSequenceClassification.from_pretrained(
        MODEL_NAME,
        subfolder="essay_scoring_model_regression_20240228_123826",
        cache_dir=huggingface_cache
    )
    model_website2 = BertForSequenceClassification.from_pretrained(
        MODEL_NAME,
        subfolder="essay_scoring_model_regression_20240229_133324",
        cache_dir=huggingface_cache
    )
    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased',
        cache_dir=huggingface_cache
    )
    print("Models loaded successfully")
except Exception as e:
    print(f"Error loading models: {e}")
    model_website1 = model_website2 = tokenizer = None


def tokenize_text(text, tokenizer):
    """Encode *text* for BERT.

    Returns:
        (input_ids, attention_mask) — each a torch tensor of shape (1, 512),
        padded/truncated to the model's 512-token limit.
    """
    tokens = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )
    return tokens['input_ids'], tokens['attention_mask']


def _error_density(essay):
    """Return combined (grammar + spelling) errors per word for *essay*.

    Grammar errors come from the remote LanguageTool check (0 if the tool
    failed to initialize); spelling errors from pyspellchecker. An empty
    essay is treated as maximally erroneous (density 1) so it is penalized
    rather than rewarded.

    NOTE: this performs one remote LanguageTool call, so callers scoring
    several categories of the same essay should compute it once and reuse it.
    """
    words = essay.split()
    grammar_errors = len(grammar_tool.check(essay)) if grammar_tool else 0
    spelling_errors = len(spell.unknown(words)) if spell else 0
    return (grammar_errors + spelling_errors) / len(words) if words else 1


def normalize_bert_score(raw_score, category, essay, error_density=None):
    """Map a sigmoid-normalized model output in [0, 1] onto a category's scale.

    Args:
        raw_score: model output after sigmoid, in [0, 1].
        category: one of the keys below; selects the target score range.
        essay: the essay text, used to derive an error penalty.
        error_density: optional precomputed value from ``_error_density``;
            pass it when scoring multiple categories of one essay to avoid
            repeating the (remote) grammar check per category.

    Returns:
        Score rounded to 1 decimal, clamped to the category's [min, max].
    """
    params = {
        'grammar': {'min': 1, 'max': 8, 'threshold': 0.8},
        'lexical': {'min': 1, 'max': 8, 'threshold': 0.8},
        'global_organization': {'min': 3, 'max': 8, 'threshold': 0.6},
        'local_organization': {'min': 3, 'max': 8, 'threshold': 0.6},
        'supporting_ideas': {'min': 3, 'max': 8, 'threshold': 0.6},
        'holistic': {'min': 1, 'max': 5, 'threshold': 0.9}
    }
    category_params = params[category]
    if error_density is None:
        error_density = _error_density(essay)
    penalty = error_density * 7
    # Linear rescale of [0, 1] onto [min, max].
    base_score = category_params['min'] + (
        raw_score * (category_params['max'] - category_params['min'])
    )
    # Only error-sensitive categories are penalized by error density.
    if category in ('grammar', 'lexical', 'holistic'):
        base_score = max(category_params['min'], base_score - penalty)
    return round(
        max(category_params['min'], min(category_params['max'], base_score)), 1
    )


def get_predictions_website1(essays):
    """Score each essay with model 1 across all six categories.

    Args:
        essays: list of essay strings.

    Returns:
        A list (one entry per essay) of 6-element score lists ordered as
        [grammar, lexical, global_organization, local_organization,
        supporting_ideas, holistic], or [] when the model/tokenizer
        failed to load.
    """
    if model_website1 is None or tokenizer is None:
        return []

    input_ids = []
    attention_masks = []
    for essay in essays:
        ids, mask = tokenize_text(essay, tokenizer)
        input_ids.append(ids)
        attention_masks.append(mask)
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    model_website1.eval()
    with torch.no_grad():
        outputs = model_website1(input_ids, attention_mask=attention_masks)
    raw_predictions = outputs.logits.cpu().numpy()

    categories = ['grammar', 'lexical', 'global_organization',
                  'local_organization', 'supporting_ideas', 'holistic']
    normalized_predictions = []
    # Pair each prediction row with ITS OWN essay (the old code always used
    # essays[0], which was wrong for batches of more than one essay), and
    # compute the expensive error density once per essay instead of once
    # per category (the grammar check is a remote API call).
    for essay, raw_pred in zip(essays, raw_predictions):
        density = _error_density(essay)
        raw_scores = 1 / (1 + np.exp(-raw_pred))  # sigmoid -> [0, 1]
        normalized_predictions.append([
            normalize_bert_score(score, category, essay, error_density=density)
            for score, category in zip(raw_scores, categories)
        ])
    return normalized_predictions


def calculate_grammar_score(essay):
    """Rule-based grammar score on a 2–10 scale (1 decimal).

    Weights LanguageTool matches by category, converts to an error density
    per 100 words, and adds a penalty for error types repeated more than
    twice. Returns None when LanguageTool is unavailable.
    """
    if not grammar_tool:
        return None
    matches = grammar_tool.check(essay)
    error_weights = {
        'SPELLING': 2.0,
        'GRAMMAR': 2.5,
        'PUNCTUATION': 1.5,
        'TYPOGRAPHY': 1.0
    }
    # Unknown categories get a middle weight of 1.5.
    weighted_errors = sum(
        error_weights.get(match.category, 1.5) for match in matches
    )
    words = len(essay.split())
    # Empty essays are treated as maximally erroneous.
    error_density = (weighted_errors / words) * 100 if words > 0 else 100
    base_score = 10 - (error_density * 0.7)
    error_types = Counter(match.category for match in matches)
    repeated_error_penalty = sum(
        count * 0.3 for count in error_types.values() if count > 2
    )
    final_score = base_score - repeated_error_penalty
    return round(max(2, min(10, final_score)), 1)


def calculate_spelling_score(essay):
    """Rule-based spelling score on a 2–10 scale (1 decimal).

    Penalizes by misspelling rate, with an extra per-error surcharge once
    more than five errors occur. An empty essay scores the minimum.
    """
    words = [word.strip('.,!?()[]{}":;') for word in essay.split()]
    misspelled = spell.unknown(words) if spell else []
    total_words = len(words)
    error_count = len(misspelled)
    error_rate = error_count / total_words if total_words > 0 else 1
    error_penalty = error_rate * 20
    if error_count > 5:
        error_penalty += (error_count - 5) * 0.5
    spelling_score = 10 - error_penalty
    return round(max(2, min(10, spelling_score)), 1)


def calculate_word_diversity(essay):
    """Heuristic vocabulary-diversity score on a 5–10 scale (1 decimal).

    Based on the unique/total ratio of content words (stop words excluded),
    penalized for words repeated more than twice and for misspellings.
    Essays with no usable words get a neutral 7.0.
    """
    words = essay.lower().translate(
        str.maketrans('', '', string.punctuation)
    ).split()
    if not words:
        return 7.0
    misspelled = spell.unknown(words) if spell else []
    spelling_penalty = len(misspelled) / len(words) * 5
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at',
                  'to', 'for', 'of', 'with', 'by'}
    content_words = [word for word in words if word not in stop_words]
    if not content_words:
        return 7.0
    total_words = len(content_words)
    unique_words = len(set(content_words))
    word_freq = Counter(content_words)
    repeated_words = sum(1 for count in word_freq.values() if count > 2)
    diversity_ratio = unique_words / total_words
    repetition_penalty = min(1.5, repeated_words / unique_words)
    base_score = 8 + (2 * diversity_ratio)
    final_score = base_score - repetition_penalty - spelling_penalty
    return round(max(5, min(10, final_score)), 1)


@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the scoring page; on POST, score the submitted essay.

    The template receives every score key (None when unavailable) so it can
    render partial results if the models or LanguageTool failed to load.
    """
    context = {
        'essay': '',
        'grammar_score': None,
        'lexical_score': None,
        'global_organization_score': None,
        'local_organization_score': None,
        'supporting_ideas_score': None,
        'holistic_score': None,
        'grammar_score2': None,
        'spelling_score': None,
        'word_diversity_score': None,
        'essay_quality_score': None
    }

    if request.method == 'POST':
        # .get avoids Flask's 400 BadRequest if the field is missing.
        essay = request.form.get('essay', '')
        context['essay'] = essay

        # Website 1: BERT model predictions (six categories).
        predictions_website1 = get_predictions_website1([essay])
        if predictions_website1 and len(predictions_website1[0]) >= 6:
            scores = predictions_website1[0]
            context.update({
                'grammar_score': scores[0],
                'lexical_score': scores[1],
                'global_organization_score': scores[2],
                'local_organization_score': scores[3],
                'supporting_ideas_score': scores[4],
                # Holistic is capped at its 5.0 maximum as a safety net.
                'holistic_score': min(5.0, scores[5])
            })

        # Website 2: rule-based scores.
        context['grammar_score2'] = calculate_grammar_score(essay)
        context['spelling_score'] = calculate_spelling_score(essay)
        context['word_diversity_score'] = calculate_word_diversity(essay)

        # Overall quality: holistic weighted twice vs. rule-based grammar.
        # Explicit None checks (scores can never legitimately be falsy 0,
        # but `is not None` states the intent precisely).
        if (context['holistic_score'] is not None
                and context['grammar_score2'] is not None):
            context['essay_quality_score'] = round(
                (context['holistic_score'] * 2 + context['grammar_score2']) / 3,
                1
            )

    return render_template('index.html', **context)


if __name__ == '__main__':
    port = int(os.environ.get('PORT', 7860))  # Hugging Face Spaces uses 7860
    app.run(host='0.0.0.0', port=port)