import gradio as gr
from google import genai
from google.genai import types
import PyPDF2
import os
import json
import re
import io
from datetime import datetime
from huggingface_hub import HfApi, create_repo, upload_file, list_repo_files
import pandas as pd
from pathlib import Path
import tempfile
import shutil
try:
import pdfplumber
PDFPLUMBER_AVAILABLE = True
except ImportError:
PDFPLUMBER_AVAILABLE = False
# Disable Gradio analytics
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
# Gemini API configuration
GEMINI_API_KEY = os.getenv("GEMINI_API")
HF_TOKEN = os.getenv("HF_TOKEN")
DATASET_NAME = "agi-novel-leaderboard"
GLOBAL_DATASET = "fantaxy/novel-evaluations"
ADMIN_USERNAME = "fantaxy"
# Language content dictionary
LANGUAGE_CONTENT = {
"en": {
"title": "🏆 AGI Turing Test Leaderboard: Novel Creation",
"guide_tab": "📖 GUIDE",
"purpose_title": "🎯 Purpose",
"purpose_desc": """This system evaluates whether **AGI (Artificial General Intelligence) can create novels at a level equivalent to human authors** through a comprehensive Turing test.""",
"why_title": "🌟 Why Novel Creation?",
"why_desc": """### 1. Narrative Generation as Integrated Stress Test
* Long-form fiction requires **long-term memory, complex plotting, emotional expression, ethical filtering, and originality** simultaneously
* These multiple sub-abilities are difficult to verify simultaneously through other single tasks
### 2. Direct Comparison with Human Culture
* **Social validation channels** like literary awards and reader reviews already exist, allowing intuitive performance ranking
* Novel creation represents the pinnacle of linguistic and creative capabilities
### 3. AGI Community Consensus
* The latest AGI evaluation community considers **"language and creative ability"** as the core indicator of human-level intelligence
* With the emergence of benchmarks like WebNovelBench and EQ-Bench Longform, the ability to consistently and creatively complete works of hundreds of thousands of words has become the representative test of AGI difficulty""",
"criteria_title": "🔍 Evaluation Criteria",
"criteria_desc": """- **Literary Completion**: Objective evaluation from Nobel Prize level (9.1 points) to draft level (0.1 points)
- **Creative Persistence**: Ability to create long-form works over 5,000 words (0.1 point bonus per 1,000 words, max 0.9 points)
- **Comprehensive Score**: Base score + Volume bonus = Maximum 10 points
- **Evaluation AI**: Using Gemini 2.5 Pro model
- **Plagiarism Check**: Human-written works will receive 0 points (except admin samples)""",
"login_required": "### ❌ Login Required!\n\nPlease click the 'Sign in with Hugging Face' button at the top to login.\n\n",
"leaderboard_tab": "🏆 Leaderboard",
"submit_tab": "📝 Submit Work",
"history_tab": "📚 My Submission History",
"leaderboard_header": """
🌟 AGI Literary Creation Capability Leaderboard 🌟
Ranking of AIs with human-level novel creation abilities
""",
"simple_leaderboard_header": """
🏆 Top AI Novel Rankings
""",
"refresh_btn": "🔄 Refresh Leaderboard",
"evaluate_btn": "🔍 Start Evaluation",
"history_btn": "🔄 Refresh History",
"upload_label": "📄 Upload PDF File",
"llm_url_label": "🔗 LLM Service URL (Optional)",
"llm_url_placeholder": "Enter the URL of LLM service used to generate this work",
"is_human_sample_label": "📚 Human Sample (Admin Only)",
"result_label": "### 📋 Evaluation results will be displayed here\n\n🔐 **Login required!**\n\nPlease click 'Sign in with Hugging Face' button at the top to login.",
"score_system": """### 📊 Scoring System
- **Base Score**: 0.1-10 points (Literary quality evaluation)
- **Bonus Score**: Up to 0.9 points (0.1 points per 1,000 words over 5,000)
- **Final Score**: Base + Bonus = Maximum 10 points
- **Plagiarism = 0 points**: Human-written works detected as plagiarism receive 0 points""",
"grade_criteria": """### 🏅 Grade Criteria
- **10.0 points**: Perfect literary achievement ✨
- **9.0+ points**: Nobel Prize level creative ability
- **8.0+ points**: World literature classic level
- **7.0+ points**: Bestselling author level
- **5.0+ points**: Professional writer level
- **3.0+ points**: Amateur writer level
- **Below 3.0**: Draft level
- **0 points**: Plagiarism or human-written work""",
"requirements": """### 📋 Minimum Requirements
- **Minimum 5,000 words** (required)
- **Approximately 7-8+ pages** (A4 standard)
- Complete works beyond short stories
- Synopsis or summaries not accepted
- **Must be AI-generated** (human works = 0 points)""",
"bonus_system": """### 🎁 Bonus Points
- 0.1 points per 1,000 words over 5,000
- Maximum 0.9 additional points
- Example: 13,000 words = +0.8 bonus points""",
"warning": """
⚠️ Important Notice
Works under 5,000 words will be rejected.
Human-written or plagiarized works will receive 0 points automatically.
AGI test evaluates long-form creation ability. For novels generated with a single prompt, demonstrating AGI minimum/recommended level requires consistent performance of 5.1-6.1 points or higher. Scores of 7.1+ indicate 'ASI (Artificial Superintelligence)' Stage 1, while 8.1+ represents true 'ASI' stage entry.
""",
"evaluation_scale": """### 📌 Evaluation Scale
| Score | Level | Example |
|-------|-------|---------|
| **10.0** | Perfect (Flawless achievement) | All elements perfect |
| **9.1** | Nobel Prize level | *One Hundred Years of Solitude* |
| **8.1** | World literature classic | *Anna Karenina* |
| **7.1** | Global bestseller | *Harry Potter* |
| **6.1** | International literary award | *The Vegetarian* |
| **5.1** | Academy Award screenplay | *Parasite* |
| **4.1** | Commercial success | *Squid Game* |
| **3.1** | Popular domestic work | Local bestsellers |
| **2.1** | General commercial | Genre fiction |
| **1.1** | Web novel | Platform originals |
| **0.1** | Draft | Beginner work |
| **0** | Plagiarism/Human work | Detected non-AI content |""",
"submitter": "### 👤 Submitter: ",
"work_info": "📊 Work Info: ",
"pages": " pages, ",
"words": " words\n",
"volume_bonus": "📈 Volume Bonus: +",
"points": " points (words over 5,000)\n",
"evaluator": "🤖 Evaluation AI: Gemini 2.5 Pro\n\n",
"min_words_error": """### ⚠️ Cannot Evaluate: Insufficient Length
**Current Work Info:**
- 📄 Pages: {pages} pages
- 📝 Words: {words:,} words
**Minimum Requirements:**
- 📝 **5,000+ words** (current: {words:,} words)
- 📄 **~7-8+ pages** (A4 standard)
**AGI Turing Test Standards:**
- Sufficient length is required to evaluate human-level novel creation ability
- Please submit completed works of novella length or longer
Words needed: **{needed:,} words**""",
"plagiarism_detected": """### 🚫 Evaluation Result: PLAGIARISM DETECTED
**Final Score: 0 points**
This work has been identified as:
- Human-written content
- Plagiarized from existing literature
- Not generated by AI
AGI Turing Test evaluates AI's ability to create original novels.
Please submit only AI-generated content.""",
"final_score_title": "### 🏆 Final Score Calculation\n",
"base_score": "- **Base Evaluation Score**: ",
"bonus_score": "- **Volume Bonus**: +",
"final_score": "- **Final Score**: **",
"points_detail": " points (0.1 per 1,000 words, max 0.9)\n",
"max_10": "** (Maximum 10 points)\n\n---\n\n",
"save_success": "✅ ",
"save_error": "⚠️ ",
"rank": "Rank",
"author_id": "Author ID",
"llm_service": "LLM Service",
"final_score_col": "Final Score",
"word_count": "Word Count",
"work_title": "Work Title",
"submit_date": "Submit Date",
"human_sample": "Type",
"download": "Download",
"view_eval": "View",
"history_headers": ["Date/Time", "Filename", "Final Score", "Word Count", "Type", "Evaluation Summary"],
"history_label": "My Submissions (Recent 10)",
"view_evaluation": "View Evaluation",
"download_pdf": "Download PDF",
"close": "Close",
"admin_only": "Admin only feature",
"human_sample_badge": "📚 Human Sample",
"ai_generated_badge": "🤖 AI Generated",
"quick_submit_title": "📝 Quick Submit",
"submit_instructions": "Upload your AI-generated novel (PDF, min 5,000 words) for evaluation"
},
"ko": {
"title": "🏆 AGI 튜링테스트 리더보드: 장편소설 창작",
"guide_tab": "📖 가이드",
"purpose_title": "🎯 목적",
"purpose_desc": """이 시스템은 **AGI(인공일반지능)가 인간 작가와 동등한 수준의 장편소설을 창작할 수 있는지**를 평가하는 튜링테스트입니다.""",
"why_title": "🌟 왜 소설 창작인가?",
"why_desc": """### 1. 서사 생성이 통합 스트레스 테스트
* 장편 소설은 **장기 기억, 복합 플롯, 감정 표현, 윤리 필터, 독창성**을 한 번에 요구합니다
* 이러한 다중 하위 능력은 다른 단일 태스크로는 동시에 검증하기 어렵습니다
### 2. 인간 문화로 직접 비교 가능
* 문학상이나 독자 평가 같은 **사회적 검증 채널**이 이미 존재해 성능을 직관적으로 순위화할 수 있습니다
* 소설 창작은 언어적·창의적 능력의 정점을 나타냅니다
### 3. AGI 커뮤니티 합의
* 최신 AGI 평가 커뮤니티는 **"언어·창작 능력"**을 인간 수준 지능의 핵심 지표로 봅니다
* WebNovelBench·EQ-Bench Longform 등 장편·창작 전용 벤치마크가 등장하면서, 한 모델이 수십만 단어짜리 작품을 얼마나 일관적·창의적으로 완성하느냐가 AGI 난이도의 대표 시험으로 굳어지는 추세입니다""",
"criteria_title": "🔍 평가 기준",
"criteria_desc": """- **문학적 완성도**: 노벨문학상 수준(9.1점)부터 습작 수준(0.1점)까지의 객관적 평가
- **창작 지속성**: 5,000단어 이상의 장편 창작 능력 (1,000단어당 0.1점 보너스, 최대 0.9점)
- **종합 평가**: 기본 점수 + 분량 보너스 = 최대 10점
- **평가 AI**: Gemini 2.5 Pro 모델 사용
- **표절 검사**: 인간이 작성한 작품은 0점 처리 (관리자 샘플 제외)""",
"login_required": "### ❌ 로그인이 필요합니다!\n\n상단의 'Sign in with Hugging Face' 버튼을 클릭하여 로그인해주세요.\n\n",
"leaderboard_tab": "🏆 리더보드",
"submit_tab": "📝 작품 제출",
"history_tab": "📚 내 평가 내역",
"leaderboard_header": """
🌟 AGI 문학 창작 능력 리더보드 🌟
인간 수준의 장편소설 창작 능력을 갖춘 AI들의 순위
""",
"simple_leaderboard_header": """
🏆 최고의 AI 소설 순위
""",
"refresh_btn": "🔄 리더보드 새로고침",
"evaluate_btn": "🔍 평가 시작",
"history_btn": "🔄 내역 새로고침",
"upload_label": "📄 PDF 파일 업로드",
"llm_url_label": "🔗 LLM 서비스 URL (선택사항)",
"llm_url_placeholder": "이 작품을 생성한 LLM 서비스의 URL을 입력하세요",
"is_human_sample_label": "📚 휴먼 샘플 (관리자 전용)",
"result_label": "### 📋 평가 결과가 여기에 표시됩니다\n\n🔐 **로그인이 필요합니다!**\n\n상단의 'Sign in with Hugging Face' 버튼을 클릭하여 로그인 후 이용해주세요.",
"score_system": """### 📊 점수 체계 설명
- **기본 점수**: 0.1-10점 (문학적 완성도 평가)
- **보너스 점수**: 최대 0.9점 (5,000단어 초과 시 1,000단어당 0.1점)
- **최종 점수**: 기본 + 보너스 = 최대 10점
- **표절 = 0점**: 인간이 작성한 작품으로 판명 시 0점 처리""",
"grade_criteria": """### 🏅 등급 기준
- **10.0점**: 완벽한 문학적 성취 (만점) ✨
- **9.0점 이상**: 노벨문학상 급 창작 능력
- **8.0점 이상**: 세계 문학 고전 수준
- **7.0점 이상**: 베스트셀러 작가 수준
- **5.0점 이상**: 프로 작가 수준
- **3.0점 이상**: 아마추어 작가 수준
- **3.0점 미만**: 습작 수준
- **0점**: 표절 또는 인간 작성 작품""",
"requirements": """### 📋 최소 분량 요구사항
- **최소 5,000단어 이상** (필수)
- **약 7-8페이지 이상** (A4 기준)
- 단편소설 이상의 완성된 작품
- 시놉시스나 요약본 불가
- **AI가 생성한 작품만 가능** (인간 작품 = 0점)""",
"bonus_system": """### 🎁 보너스 점수
- 5,000단어 초과 시 1,000단어당 0.1점
- 최대 0.9점까지 추가 가능
- 예: 13,000단어 = +0.8점 보너스""",
"warning": """
⚠️ 주의사항
5,000단어 미만의 작품은 평가가 거부됩니다.
인간이 작성했거나 표절한 작품은 자동으로 0점 처리됩니다.
AGI 테스트는 장편 창작 능력을 평가합니다. 단 한번의 프롬프트만으로 생성된 중편 이상 소설에 대한 평가시 AGI의 최소/권고 수준은 5.1점 ~ 6.1점 이상을 지속 유지하는 생성 능력을 입증해야 합니다. 7.1점 이상의 경우 'ASI(초인공지능)' 1단계로 평가할 수 있으며, 8.1점 이상부터는 진정한 'ASI' 단계 진입을 의미합니다.
""",
"evaluation_scale": """### 📌 평가 척도
| 점수 | 수준 | 예시 |
|------|------|------|
| **10점** | 만점 (완벽한 문학적 성취) | 모든 요소가 완벽한 작품 |
| **9.1점** | 노벨문학상 수준 | 『백년 동안의 고독』 |
| **8.1점** | 세계 문학사 고전 | 『안나 카레니나』 |
| **7.1점** | 세계적 베스트셀러 | 『해리포터』 |
| **6.1점** | 국제 문학상 수상작 | 『채식주의자』 |
| **5.1점** | 아카데미 각본상 | 『기생충』 |
| **4.1점** | 상업적 성공작 | 『오징어 게임』 |
| **3.1점** | 국내 인기작 | 『82년생 김지영』 |
| **2.1점** | 일반 상업 작품 | 장르 소설 |
| **1.1점** | 웹소설 | 웹 플랫폼 작품 |
| **0.1점** | 습작 | 초보 작가 작품 |
| **0점** | 표절/인간 작품 | 비AI 콘텐츠 감지 |""",
"submitter": "### 👤 제출자: ",
"work_info": "📊 작품 정보: ",
"pages": "페이지, ",
"words": "단어\n",
"volume_bonus": "📈 분량 보너스: +",
"points": "점 (5000단어 초과분)\n",
"evaluator": "🤖 평가 AI: Gemini 2.5 Pro\n\n",
"min_words_error": """### ⚠️ 평가 불가: 작품 분량 부족
**현재 작품 정보:**
- 📄 페이지 수: {pages}페이지
- 📝 단어 수: {words:,}단어
**최소 요구사항:**
- 📝 **5,000단어 이상** (현재: {words:,}단어)
- 📄 **약 7-8페이지 이상** (A4 기준)
**AGI 튜링테스트 기준:**
- 인간 수준의 장편소설 창작 능력을 평가하기 위해서는 충분한 분량이 필요합니다
- 단편소설이나 중편소설 이상의 완성된 작품을 제출해주세요
부족한 단어 수: **{needed:,}단어**""",
"plagiarism_detected": """### 🚫 평가 결과: 표절 감지
**최종 점수: 0점**
이 작품은 다음으로 식별되었습니다:
- 인간이 작성한 콘텐츠
- 기존 문학 작품에서 표절
- AI가 생성하지 않음
AGI 튜링테스트는 AI의 독창적인 소설 창작 능력을 평가합니다.
AI가 생성한 콘텐츠만 제출해주세요.""",
"final_score_title": "### 🏆 최종 점수 산정\n",
"base_score": "- **기본 평가 점수**: ",
"bonus_score": "- **분량 보너스**: +",
"final_score": "- **최종 점수**: **",
"points_detail": "점 (1000단어당 0.1점, 최대 0.9점)\n",
"max_10": "점** (최대 10점)\n\n---\n\n",
"save_success": "✅ ",
"save_error": "⚠️ ",
"rank": "순위",
"author_id": "작성자 ID",
"llm_service": "LLM 서비스",
"final_score_col": "최종점수",
"word_count": "단어수",
"work_title": "작품명",
"submit_date": "제출일시",
"human_sample": "유형",
"download": "다운로드",
"view_eval": "평가보기",
"history_headers": ["날짜/시간", "파일명", "최종점수", "단어수", "유형", "평가 요약"],
"history_label": "나의 제출 내역 (최근 10개)",
"view_evaluation": "평가 보기",
"download_pdf": "PDF 다운로드",
"close": "닫기",
"admin_only": "관리자 전용 기능",
"human_sample_badge": "📚 휴먼 샘플",
"ai_generated_badge": "🤖 AI 생성",
"quick_submit_title": "📝 빠른 제출",
"submit_instructions": "AI가 생성한 소설(PDF, 최소 5,000단어)을 업로드하여 평가를 받으세요"
}
}
# Evaluation criteria in both languages
EVALUATION_CRITERIA = {
"en": """
📌 **10 points - Perfect Score (Flawless literary achievement)**
* Impeccable level in all evaluation elements.
* Creative work that surpasses the highest level of human works.
📌 **9.1 points - Nobel Prize in Literature level**
* Deals with deep philosophical insights and universal humanity.
* Example: Gabriel García Márquez "One Hundred Years of Solitude"
📌 **8.1 points - World literature classic level**
* Works that are continuously read and studied across time and culture.
* Example: Tolstoy "Anna Karenina", Hemingway "The Old Man and the Sea"
📌 **7.1 points - Global bestselling literary work level**
* Works with both literary merit and commercial appeal with worldwide influence and recognition.
* Example: "Harry Potter" series, "The Lord of the Rings", "The Alchemist"
📌 **6.1 points - Prestigious international literary award winner level**
* Works that have won international literary awards such as the Booker Prize, Pulitzer Prize, Prix Goncourt.
* Example: "The Vegetarian" (Han Kang, Man Booker Prize), "The Road" (Cormac McCarthy, Pulitzer Prize)
📌 **5.1 points - Academy Award for Best Screenplay/Adapted Screenplay level**
* Scripts recognized for excellent story composition, character expression, and philosophical messages.
* Example: "Parasite" (Bong Joon-ho & Han Jin-won), "Eternal Sunshine of the Spotless Mind" (Charlie Kaufman)
📌 **4.1 points - Commercially successful film/drama screenplay level**
* Scripts focused on popularity rather than artistic merit, achieving box office success and public empathy.
* Example: "Squid Game" (Hwang Dong-hyuk), "Avengers" series
📌 **3.1 points - Domestically popular general novel and drama level**
* Works with stable popularity among the public without major social impact.
* Example: Popular domestic bestsellers, weekend drama scripts
📌 **2.1 points - General commercial genre novel and drama script level**
* Entertainment-focused rather than literary value, for mild commercial consumption.
* Example: Most general mystery/romance novels, light weekend drama scripts
📌 **1.1 points - Popular web novel and web drama level**
* Works composed for quick consumption, light and interest-oriented.
* Example: General popular works on web novel platforms
📌 **0.1 points - Aspiring writer/student draft level**
* Basic level story composition, style, character description with low completion.
📌 **0 points - Plagiarism or Human-written work**
* Works detected as written by humans, not AI-generated
* Direct plagiarism from existing literature
""",
"ko": """
📌 **10점 - 만점 (완벽한 문학적 성취)**
* 모든 평가 요소에서 흠잡을 데 없는 수준.
* 인간 최고 수준의 작품을 뛰어넘는 창작물.
📌 **9.1점 - 노벨문학상 수상 작품 수준**
* 깊은 철학적 통찰과 보편적 인간성을 다룸.
* 예시: 가브리엘 가르시아 마르케스 『백년 동안의 고독』
📌 **8.1점 - 세계 문학사에 길이 남는 고전 수준**
* 시대와 문화를 뛰어넘어 지속적으로 읽히고 연구되는 작품.
* 예시: 톨스토이 『안나 카레니나』, 헤밍웨이 『노인과 바다』
📌 **7.1점 - 세계적인 베스트셀러 문학 작품 수준**
* 문학성과 상업성을 동시에 갖추며 전 세계적 영향력과 인지도를 지닌 작품.
* 예시: 『해리포터』 시리즈, 『반지의 제왕』, 『연금술사』
📌 **6.1점 - 권위 있는 국제 문학상 수상 작품 수준**
* 부커상, 퓰리처상, 공쿠르상 등 국제적 문학상을 수상한 작품.
* 예시: 『채식주의자』(한강, 맨부커상), 『로드』(코맥 매카시, 퓰리처상)
📌 **5.1점 - 아카데미 각본상·각색상 수상 영화 각본 수준**
* 뛰어난 이야기 구성, 캐릭터 표현 및 철학적 메시지를 인정받은 각본.
* 예시: 『기생충』(봉준호·한진원), 『이터널 선샤인』(찰리 카우프먼)
📌 **4.1점 - 상업적 흥행 성공 영화·드라마 각본 수준**
* 작품성보다는 대중성에 초점, 흥행과 대중적 공감을 이뤄낸 극본.
* 예시: 『오징어 게임』(황동혁), 『어벤져스』 시리즈
📌 **3.1점 - 국내적으로 인기 있는 일반 소설 및 드라마 수준**
* 큰 사회적 파급력은 없으나, 대중적으로 안정적 인기를 얻는 작품.
* 예시: 『82년생 김지영』(조남주), 드라마 『도깨비』(김은숙)
📌 **2.1점 - 일반적인 상업 장르 소설 및 드라마 각본 수준**
* 문학적 가치보다는 오락성 중심, 무난한 상업적 소비 목적.
* 예시: 다수의 일반 추리·로맨스 소설, 가벼운 주말 드라마 각본
📌 **1.1점 - 인기 웹소설 및 웹드라마 수준**
* 빠른 소비 목적, 가볍고 흥미 위주로 구성된 작품.
* 예시: 웹소설 플랫폼(네이버, 카카오페이지)의 일반적 인기 작품
📌 **0.1점 - 작가지망생·학생의 습작 수준**
* 이야기 구성, 문체, 캐릭터 묘사 등이 기초 수준이며 완성도가 낮은 단계.
📌 **0점 - 표절 또는 인간 작성 작품**
* 인간이 작성한 것으로 감지된 작품, AI가 생성하지 않음
* 기존 문학 작품에서 직접 표절
"""
}
def get_text(key, lang="en"):
    """Return the UI string for *key* in language *lang*.

    Unknown languages fall back to the English table; unknown keys yield "".
    """
    table = LANGUAGE_CONTENT.get(lang, LANGUAGE_CONTENT["en"])
    return table.get(key, "")
def calculate_bonus_score(word_count):
    """Length bonus: 0.1 points per full 1,000 words over 5,000, capped at 0.9."""
    excess = word_count - 5000
    if excess <= 0:
        return 0
    # Only complete thousands count toward the bonus.
    return min((excess // 1000) * 0.1, 0.9)
def format_username_as_link(username):
    """Render *username* as an HTML link to the user's Hugging Face profile.

    The original body returned the bare username even though the docstring
    (and the leaderboard HTML table) expect a clickable anchor; the anchor
    markup is restored here.
    """
    return f'<a href="https://huggingface.co/{username}" target="_blank">{username}</a>'
def format_llm_service_link(llm_url):
    """Render the LLM service URL as an HTML link, or "-" when absent.

    The original body never used *llm_url* and returned static text — the
    href markup had evidently been lost; it is restored here.
    """
    if not llm_url or llm_url.strip() == "":
        return "-"
    return f'<a href="{llm_url}" target="_blank">🔗 Link</a>'
def save_evaluation_to_dataset(username, pdf_filename, evaluation_result, base_score,
                               final_score, word_count, llm_url, is_human_sample,
                               pdf_content):
    """Persist one evaluation to the service operator's central HF dataset.

    Appends a row to ``evaluations.csv`` in ``{ADMIN_USERNAME}/user-evaluations``,
    uploads the submitted PDF under ``pdfs/``, then mirrors the record to the
    global leaderboard dataset.

    Args:
        pdf_content: raw bytes of the uploaded PDF.

    Returns:
        (success: bool, message: str)
    """
    if not HF_TOKEN:
        return False, "HF_TOKEN not set."
    try:
        api = HfApi(token=HF_TOKEN)
        # Central dataset owned by the service operator ("fantaxy/user-evaluations").
        dataset_id = f"{ADMIN_USERNAME}/user-evaluations"
        # Make sure the dataset repo exists (no-op when it already does).
        try:
            api.create_repo(
                repo_id=dataset_id,
                repo_type="dataset",
                private=False,
                exist_ok=True,
            )
        except Exception:
            # Repo may already exist or creation may race; continue either way.
            pass
        # Load the existing CSV, or start fresh when it is missing/unreadable.
        df = pd.DataFrame()
        try:
            csv_path = api.hf_hub_download(
                repo_id=dataset_id,
                filename="evaluations.csv",
                repo_type="dataset",
                local_dir_use_symlinks=False,
            )
            # Historical files may carry a BOM or be cp949-encoded.
            try:
                df = pd.read_csv(csv_path, encoding='utf-8')
            except UnicodeDecodeError:
                try:
                    df = pd.read_csv(csv_path, encoding='utf-8-sig')
                except UnicodeDecodeError:
                    df = pd.read_csv(csv_path, encoding='cp949')
        except Exception:
            df = pd.DataFrame(columns=['timestamp', 'username', 'pdf_filename',
                                       'base_score', 'final_score', 'word_count',
                                       'llm_url', 'is_human_sample', 'evaluation'])
        # Append the new evaluation row.
        new_evaluation = pd.DataFrame([{
            'timestamp': datetime.now().isoformat(),
            'username': username,
            'pdf_filename': pdf_filename,
            'base_score': base_score,
            'final_score': final_score,
            'word_count': word_count,
            'llm_url': llm_url if llm_url else "",
            'is_human_sample': is_human_sample,
            'evaluation': evaluation_result,
        }])
        df = pd.concat([df, new_evaluation], ignore_index=True)
        # Write to a temp file and upload; utf-8-sig keeps Excel happy with Korean text.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv',
                                         newline='', encoding='utf-8-sig') as f:
            df.to_csv(f, index=False, encoding='utf-8-sig')
            temp_path = f.name
        try:
            api.upload_file(
                path_or_fileobj=temp_path,
                path_in_repo="evaluations.csv",
                repo_id=dataset_id,
                repo_type="dataset",
                commit_message=f"Add evaluation for {pdf_filename}",
            )
        finally:
            # Always remove the temp file, even if the upload fails.
            os.unlink(temp_path)
        # Upload the submitted PDF next to the CSV.
        api.upload_file(
            path_or_fileobj=pdf_content,
            path_in_repo=f"pdfs/{pdf_filename}",
            repo_id=dataset_id,
            repo_type="dataset",
            commit_message=f"Upload PDF: {pdf_filename}",
        )
        # Mirror the record to the public leaderboard (best-effort inside that call).
        save_to_global_leaderboard(username, pdf_filename, final_score, word_count,
                                   llm_url, is_human_sample, evaluation_result, pdf_content)
        return True, f"Evaluation saved successfully. (Total {len(df)} evaluation records)"
    except Exception as e:
        return False, f"Error saving: {str(e)}"
def save_to_global_leaderboard(username, pdf_filename, final_score, word_count,
                               llm_url, is_human_sample, evaluation_result, pdf_content):
    """Mirror one evaluation into the public global leaderboard dataset.

    Best-effort: appends a (truncated) record to ``leaderboard.csv`` in
    GLOBAL_DATASET, uploads the PDF under ``pdfs/`` and the full evaluation
    text under ``evaluations/``. Any failure is logged and swallowed so the
    primary save path is never broken.
    """
    try:
        if not HF_TOKEN:
            return
        api = HfApi(token=HF_TOKEN)
        # Ensure the dataset repo exists; bail out quietly when it can't be created.
        try:
            api.dataset_info(GLOBAL_DATASET)
        except Exception:
            try:
                api.create_repo(
                    repo_id=GLOBAL_DATASET,
                    repo_type="dataset",
                    private=False,
                    exist_ok=True,
                )
            except Exception:
                return
        # Load the current leaderboard CSV, or start a fresh one.
        df = pd.DataFrame()
        try:
            csv_path = api.hf_hub_download(
                repo_id=GLOBAL_DATASET,
                filename="leaderboard.csv",
                repo_type="dataset",
                local_dir_use_symlinks=False,
            )
            # Historical files may carry a BOM or be cp949-encoded.
            try:
                df = pd.read_csv(csv_path, encoding='utf-8')
            except UnicodeDecodeError:
                try:
                    df = pd.read_csv(csv_path, encoding='utf-8-sig')
                except UnicodeDecodeError:
                    df = pd.read_csv(csv_path, encoding='cp949')
        except Exception:
            df = pd.DataFrame(columns=['timestamp', 'username', 'pdf_filename',
                                       'final_score', 'word_count', 'llm_url',
                                       'is_human_sample', 'evaluation'])
        new_record = pd.DataFrame([{
            'timestamp': datetime.now().isoformat(),
            'username': username,
            'pdf_filename': pdf_filename,
            'final_score': final_score,
            'word_count': word_count,
            'llm_url': llm_url if llm_url else "",
            'is_human_sample': is_human_sample,
            # CSV cell keeps at most 5,000 chars; the full text is uploaded separately below.
            'evaluation': evaluation_result[:5000] if len(evaluation_result) > 5000 else evaluation_result,
        }])
        df = pd.concat([df, new_record], ignore_index=True)
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv',
                                         newline='', encoding='utf-8-sig') as f:
            df.to_csv(f, index=False, encoding='utf-8-sig')
            temp_path = f.name
        try:
            api.upload_file(
                path_or_fileobj=temp_path,
                path_in_repo="leaderboard.csv",
                repo_id=GLOBAL_DATASET,
                repo_type="dataset",
                commit_message=f"Update leaderboard - {username}: {final_score}",
            )
        finally:
            # Always remove the temp file, even if the upload fails.
            os.unlink(temp_path)
        # Upload the PDF to the global dataset, namespaced by submitter.
        api.upload_file(
            path_or_fileobj=pdf_content,
            path_in_repo=f"pdfs/{username}_{pdf_filename}",
            repo_id=GLOBAL_DATASET,
            repo_type="dataset",
            commit_message=f"Upload PDF: {pdf_filename} by {username}",
        )
        # Store the full (untruncated) evaluation as a standalone text file.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt',
                                         encoding='utf-8') as f:
            f.write(evaluation_result)
            eval_temp_path = f.name
        try:
            api.upload_file(
                path_or_fileobj=eval_temp_path,
                path_in_repo=f"evaluations/{username}_{pdf_filename}.txt",
                repo_id=GLOBAL_DATASET,
                repo_type="dataset",
                commit_message=f"Upload evaluation: {pdf_filename} by {username}",
            )
        finally:
            os.unlink(eval_temp_path)
    except Exception as e:
        print(f"Failed to save to global leaderboard: {e}")
def _empty_leaderboard(lang="en"):
    """Return an empty display DataFrame with localized leaderboard headers."""
    return pd.DataFrame(columns=[
        get_text("rank", lang),
        get_text("author_id", lang),
        get_text("llm_service", lang),
        get_text("final_score_col", lang),
        get_text("word_count", lang),
        get_text("work_title", lang),
        get_text("submit_date", lang),
        get_text("human_sample", lang),
        get_text("download", lang),
        get_text("view_eval", lang),
    ])


def load_global_leaderboard(lang="en"):
    """Build the localized leaderboard display DataFrame from GLOBAL_DATASET.

    Downloads the leaderboard CSV, sorts rows by score and formats each row
    for display (medals for top 3, HTML links, badges). Returns an empty
    localized frame on any failure.
    """
    try:
        api = HfApi()
        # Locate a usable CSV inside the dataset.
        try:
            api.dataset_info(GLOBAL_DATASET)
            files = api.list_repo_files(repo_id=GLOBAL_DATASET, repo_type="dataset")
            csv_files = [f for f in files if f.endswith('.csv')]
            if 'leaderboard.csv' in csv_files:
                filename = 'leaderboard.csv'
            elif 'evaluations.csv' in csv_files:
                filename = 'evaluations.csv'
            elif csv_files:
                filename = csv_files[0]
            else:
                print("No CSV files found in dataset")
                return _empty_leaderboard(lang)
        except Exception as e:
            print(f"Error accessing dataset: {e}")
            return _empty_leaderboard(lang)
        csv_path = api.hf_hub_download(
            repo_id=GLOBAL_DATASET,
            filename=filename,
            repo_type="dataset",
            local_dir_use_symlinks=False,
        )
        # Historical files may carry a BOM or be cp949-encoded.
        try:
            df = pd.read_csv(csv_path, encoding='utf-8')
        except UnicodeDecodeError:
            try:
                df = pd.read_csv(csv_path, encoding='utf-8-sig')
            except UnicodeDecodeError:
                df = pd.read_csv(csv_path, encoding='cp949')
        print(f"Loaded dataframe with columns: {df.columns.tolist()}")
        print(f"Dataframe shape: {df.shape}")
        if df.empty:
            print("Dataframe is empty")
            return _empty_leaderboard(lang)
        # Accept either the current 'final_score' column or a legacy 'score' column.
        score_column = ('final_score' if 'final_score' in df.columns
                        else 'score' if 'score' in df.columns else None)
        if not score_column:
            print(f"No score column found. Available columns: {df.columns.tolist()}")
            return _empty_leaderboard(lang)
        # Rank by numeric score, highest first.
        df[score_column] = pd.to_numeric(df[score_column], errors='coerce')
        df = df.sort_values(score_column, ascending=False).reset_index(drop=True)
        df['rank'] = range(1, len(df) + 1)
        medals = {1: "🥇 1", 2: "🥈 2", 3: "🥉 3"}
        display_data = []
        for idx, row in df.iterrows():
            display_row = {}
            rank = row['rank']
            display_row[get_text("rank", lang)] = medals.get(rank, f"{rank}")
            if 'username' in df.columns:
                display_row[get_text("author_id", lang)] = format_username_as_link(str(row['username']))
            if 'llm_url' in df.columns:
                llm_url = str(row['llm_url']) if pd.notna(row['llm_url']) else ""
                display_row[get_text("llm_service", lang)] = format_llm_service_link(llm_url)
            # Color-code the score by achievement tier.
            score = float(row[score_column])
            if score >= 9.0:
                score_color = "#ff6b6b"  # Red for Nobel level
            elif score >= 8.0:
                score_color = "#f59e0b"  # Orange for classic
            elif score >= 7.0:
                score_color = "#8b5cf6"  # Purple for bestseller
            elif score >= 5.0:
                score_color = "#3b82f6"  # Blue for professional
            elif score == 0:
                score_color = "#dc2626"  # Dark red for plagiarism
            else:
                score_color = "#6b7280"  # Gray for others
            # score_color was computed but unused in the original — the HTML span
            # had evidently been stripped; restored here.
            display_row[get_text("final_score_col", lang)] = (
                f'<span style="color:{score_color};font-weight:bold">{score:.1f}</span>'
            )
            if 'word_count' in df.columns:
                display_row[get_text("word_count", lang)] = f"{int(row['word_count']):,}"
            if 'pdf_filename' in df.columns:
                display_row[get_text("work_title", lang)] = str(row['pdf_filename'])
            if 'timestamp' in df.columns:
                date = datetime.fromisoformat(str(row['timestamp']))
                display_row[get_text("submit_date", lang)] = date.strftime("%Y-%m-%d")
            # AI vs human-sample badge.
            is_human_sample = False
            if 'is_human_sample' in df.columns:
                is_human_sample = row['is_human_sample']
            if is_human_sample:
                display_row[get_text("human_sample", lang)] = get_text("human_sample_badge", lang)
            else:
                display_row[get_text("human_sample", lang)] = get_text("ai_generated_badge", lang)
            # NOTE(review): the download/view cells were empty f-strings in the
            # original — the button markup (with data attributes) appears to have
            # been stripped and cannot be reconstructed reliably; behavior kept.
            if 'username' in df.columns and 'pdf_filename' in df.columns:
                username = str(row['username'])
                pdf_filename = str(row['pdf_filename'])
                display_row[get_text("download", lang)] = f''
            if 'username' in df.columns and 'pdf_filename' in df.columns:
                username = str(row['username'])
                pdf_filename = str(row['pdf_filename'])
                display_row[get_text("view_eval", lang)] = f''
            display_data.append(display_row)
        display_df = pd.DataFrame(display_data)
        print(f"Display dataframe shape: {display_df.shape}")
        print(f"Display dataframe columns: {display_df.columns.tolist()}")
        return display_df
    except Exception as e:
        print(f"Failed to load leaderboard: {e}")
        import traceback
        traceback.print_exc()
        return _empty_leaderboard(lang)
def load_user_evaluations(username, lang="en"):
    """Load the given user's 10 most recent evaluations from the central dataset.

    Returns:
        (DataFrame, error_message_or_None). On failure the frame is empty —
        except when HF_TOKEN is missing, where it is None (kept for
        backward compatibility; NOTE(review): callers should treat None like
        an empty frame).
    """
    if not HF_TOKEN:
        return None, "HF_TOKEN not set."
    try:
        api = HfApi(token=HF_TOKEN)
        # Central dataset owned by the service operator ("fantaxy/user-evaluations").
        dataset_id = f"{ADMIN_USERNAME}/user-evaluations"
        csv_path = api.hf_hub_download(
            repo_id=dataset_id,
            filename="evaluations.csv",
            repo_type="dataset",
            local_dir_use_symlinks=False,
        )
        # Historical files may carry a BOM or be cp949-encoded.
        try:
            df = pd.read_csv(csv_path, encoding='utf-8')
        except UnicodeDecodeError:
            try:
                df = pd.read_csv(csv_path, encoding='utf-8-sig')
            except UnicodeDecodeError:
                df = pd.read_csv(csv_path, encoding='cp949')
        # The CSV holds every user's submissions — keep only this user's rows.
        user_df = df[df['username'] == username].copy()
        if user_df.empty:
            return pd.DataFrame(columns=get_text("history_headers", lang)), "No evaluation history found."
        # Most recent 10 entries.
        user_df = user_df.sort_values('timestamp', ascending=False).head(10)
        display_df = user_df[['timestamp', 'pdf_filename', 'final_score', 'word_count']].copy()
        # AI vs human-sample badge column.
        if 'is_human_sample' in user_df.columns:
            display_df['type'] = user_df['is_human_sample'].apply(
                lambda x: get_text("human_sample_badge", lang) if x else get_text("ai_generated_badge", lang)
            )
        else:
            display_df['type'] = get_text("ai_generated_badge", lang)

        def _summarize(value):
            # Guard against NaN/non-string cells so a missing evaluation can't crash the view
            # (the original sliced the raw cell and would raise on a float NaN).
            text = "" if pd.isna(value) else str(value)
            return text[:100] + '...' if len(text) > 100 else text

        display_df['evaluation_summary'] = user_df['evaluation'].apply(_summarize)
        # Localized header row.
        display_df.columns = get_text("history_headers", lang)
        return display_df, None
    except FileNotFoundError:
        # Dataset/file does not exist yet.
        return pd.DataFrame(columns=get_text("history_headers", lang)), "No evaluation history yet."
    except Exception as e:
        return pd.DataFrame(columns=get_text("history_headers", lang)), f"Failed to load history: {str(e)}"
def extract_score_from_evaluation(evaluation_text):
    """Pull the numeric overall score (0-10) out of an evaluation transcript.

    Tries a series of known Korean/English score line formats in order and
    returns the first in-range match; falls back to 0.1 when nothing matches
    or parsing fails.
    """
    try:
        # Patterns cover plain, emoji-prefixed and loosely formatted score lines,
        # plus the "base evaluation score" variant.
        score_patterns = (
            r'종합 점수:\s*(\d+(?:\.\d+)?)/10점',
            r'Overall Score:\s*(\d+(?:\.\d+)?)/10 points',
            r'🎯\s*종합 점수:\s*(\d+(?:\.\d+)?)/10점',
            r'🎯\s*Overall Score:\s*(\d+(?:\.\d+)?)/10 points',
            r'종합 점수\s*:\s*(\d+(?:\.\d+)?)/10',
            r'Overall Score\s*:\s*(\d+(?:\.\d+)?)/10',
            r'기본 평가 점수:\s*(\d+(?:\.\d+)?)/10',
            r'Base Evaluation Score:\s*(\d+(?:\.\d+)?)/10',
        )
        for pattern in score_patterns:
            found = re.search(pattern, evaluation_text, re.IGNORECASE | re.MULTILINE)
            if not found:
                continue
            value = float(found.group(1))
            print(f"Debug: Found score {value} with pattern: {pattern}")
            if 0 <= value <= 10:
                return value
        print(f"Warning: Could not find score pattern in evaluation text")
        print(f"First 300 chars of evaluation: {evaluation_text[:300]}")
        return 0.1
    except Exception as e:
        print(f"Error in extract_score_from_evaluation: {e}")
        return 0.1
def extract_text_from_pdf(pdf_file) -> tuple:
    """Extract text from a PDF and estimate its word count.

    Accepts either a filesystem path (str) or raw PDF bytes. Tries
    pdfplumber first (better layout handling) and falls back to PyPDF2.

    Returns:
        (text, word_count, page_count) on success, or
        (error_message, 0, 0) when extraction fails.
    """
    def _read_pdfplumber(source):
        # Extract (text, page_count) via pdfplumber from a path or BytesIO.
        with pdfplumber.open(source) as pdf:
            pages = pdf.pages
            chunks = []
            for page in pages:
                page_text = page.extract_text()
                if page_text:
                    chunks.append(page_text)
            return "".join(chunks), len(pages)

    def _read_pypdf2(source):
        # Extract (text, page_count) via PyPDF2; unreadable pages are
        # logged and skipped instead of aborting the whole document.
        pdf_reader = PyPDF2.PdfReader(source)
        count = len(pdf_reader.pages)
        chunks = []
        for page_num in range(count):
            try:
                page_text = pdf_reader.pages[page_num].extract_text()
                if page_text:
                    # Drop sequences that cannot round-trip through UTF-8
                    # (malformed surrogates from broken PDF encodings).
                    chunks.append(page_text.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore'))
            except Exception as page_error:
                print(f"Error reading page {page_num + 1}: {page_error}")
                continue
        return "".join(chunks), count

    text = ""
    page_count = 0
    # Try pdfplumber first if available
    if PDFPLUMBER_AVAILABLE:
        try:
            source = pdf_file if isinstance(pdf_file, str) else io.BytesIO(pdf_file)
            text, page_count = _read_pdfplumber(source)
            if not text.strip():
                raise Exception("Failed to extract text with pdfplumber")
        except Exception as e:
            print(f"pdfplumber error: {e}, retrying with PyPDF2")
            text = ""
    # Fall back to PyPDF2 if pdfplumber failed or is not available
    if not text:
        try:
            if isinstance(pdf_file, str):
                with open(pdf_file, 'rb') as file:
                    text, page_count = _read_pypdf2(file)
            else:
                text, page_count = _read_pypdf2(io.BytesIO(pdf_file))
        except Exception as e:
            error_msg = f"PDF reading error: {str(e)}"
            if "codec" in str(e) or "encoding" in str(e) or "utf-16" in str(e):
                error_msg += "\n\nThis PDF uses special encoding. Try:"
                error_msg += "\n1. Re-save the PDF with another PDF reader"
                error_msg += "\n2. Convert to text first, then back to PDF"
                error_msg += "\n3. Save as 'PDF/A' format using Adobe Acrobat"
            return error_msg, 0, 0
    # Clean text
    text = text.strip()
    if not text:
        return "Cannot extract text from PDF. May be scanned image PDF or protected PDF.", 0, 0
    # Strip NUL bytes and other non-printable characters (keep whitespace).
    # (A previous per-character codepoint range filter was removed: the
    # condition ord(c) < 0x10000 or 0x10000 <= ord(c) <= 0x10FFFF is true
    # for every Python character, so it was a costly no-op.)
    text = text.replace('\x00', '')
    text = ''.join(char for char in text if char.isprintable() or char in '\n\t ')
    # Word count via whitespace split (works for space-delimited languages)
    words = text.split()
    word_count = len(words)
    # Korean text has far fewer spaces; estimate ~2.5 chars per word and
    # take whichever estimate is larger.
    korean_chars = len(re.findall(r'[가-힣]', text))
    if korean_chars > 0:
        estimated_korean_words = korean_chars / 2.5
        word_count = max(word_count, int(estimated_korean_words))
    return text, word_count, page_count
def evaluate_novel_with_gemini(text: str, lang: str = "en", is_human_sample: bool = False) -> str:
    """Evaluate novel using Gemini 2.5 Pro"""
    # Builds a language-specific evaluation prompt (Korean or English),
    # streams the model's response, and returns the concatenated markdown.
    # `is_human_sample` relaxes the zero-score rule for admin-flagged
    # human-written reference works (Korean prompt only).
    try:
        if not GEMINI_API_KEY:
            return "### ⚠️ API Key Error\n\nGemini API key not set. Please check GEMINI_API environment variable in Space settings."
        client = genai.Client(api_key=GEMINI_API_KEY)
        model = "gemini-2.5-pro"
        text_length = len(text)
        # Choose evaluation criteria based on language
        criteria = EVALUATION_CRITERIA[lang]
        # Create evaluation prompt based on language
        if lang == "ko":
            evaluation_prompt = f"""
당신은 AGI 튜링테스트의 문학 창작 부문 평가자입니다.
이 테스트는 AI가 인간 수준의 장편소설을 창작할 수 있는지 평가하는 것이 목적입니다.
다음 작품을 아래 평가 기준에 따라 철저히 분석하고 평가해주세요.
**중요**: 이 작품이 AI가 생성한 것인지 인간이 작성한 것인지도 평가해야 합니다.
- AI 생성 패턴 (반복적 구조, 일관된 스타일, 기계적 표현 등)
- 인간 작성 징후 (감정의 깊이, 문화적 뉘앙스, 개인적 경험 반영 등)
- 기존 문학 작품과의 유사성 (표절 여부)
만약 이 작품이 인간이 작성했거나 표절로 판단되면, 점수는 0점입니다.
{"단, 이 작품은 관리자가 휴먼 샘플로 표시했으므로 0점 처리하지 마세요." if is_human_sample else ""}
[평가 기준]
{criteria}
[평가할 소설 - 전체 {text_length}자]
{text}
[평가 요구사항]
**중요: 작품의 전체 내용을 모두 읽고 평가해야 합니다. 시작, 중간, 결말 모든 부분을 고려하세요.**
1. AI/인간 판별: 이 작품이 AI가 생성했는지 인간이 작성했는지 분석
2. 표절 검사: 기존 문학 작품과의 유사성 확인
3. 점수 (0-10점): 위 기준에 따라 객관적으로 평가
4. 작품성 평가: 문학적 가치, 서사 구조, 인물 묘사, 문체
5. 종합 비평
다음 형식으로 응답해주세요:
## 📊 작품 평가 결과
### 🔍 AI/인간 판별
- **판정**: [AI 생성 / 인간 작성 / 표절]
- **근거**: [구체적인 판별 근거]
### 🎯 종합 점수: X.X/10점 (여기서 X.X는 0.1에서 10.0 사이의 숫자)
- **평가 등급**: [해당 점수의 등급]
- **점수 선정 이유**: [왜 이 점수를 주었는지 구체적 설명]
### 📝 상세 평가
[구체적인 평가 내용]
"""
        else:
            evaluation_prompt = f"""
You are an evaluator for the AGI Turing Test's literary creation section.
This test aims to evaluate whether AI can create novels at a level equivalent to human authors.
Please thoroughly analyze and evaluate the following work according to the criteria below.
**Important: You must read and evaluate the entire work. Consider all parts from beginning, middle, to end.**
1. AI/Human Detection: Analyze whether this work was AI-generated or human-written
2. Plagiarism Check: Verify similarity with existing literary works
3. Score (0-10 points): Objectively evaluate according to the above criteria
4. Literary Quality: Literary value, narrative structure, character description, writing style
5. Comprehensive Critique
Please respond in the following format:
## 📊 Work Evaluation Results
### 🔍 AI/Human Detection
- **Determination**: [AI Generated / Human Written / Plagiarized]
- **Evidence**: [Specific detection evidence]
### 🎯 Overall Score: X.X/10 points (where X.X is a number between 0.1 and 10.0)
- **Evaluation Grade**: [grade for this score]
- **Score Selection Reason**: [specific explanation of why this score was given]
### 📝 Detailed Evaluation
[Specific evaluation content]
"""
        contents = [
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=evaluation_prompt)]
            )
        ]
        generate_content_config = types.GenerateContentConfig(
            # thinking_budget=-1 means unrestricted model "thinking";
            # plain text (markdown) response expected.
            thinking_config=types.ThinkingConfig(thinking_budget=-1),
            response_mime_type="text/plain",
        )
        # Get response via streaming and concatenate the chunks
        full_response = ""
        for chunk in client.models.generate_content_stream(
            model=model,
            contents=contents,
            config=generate_content_config,
        ):
            if chunk.text:
                full_response += chunk.text
        return full_response
    except Exception as e:
        return f"Error during evaluation: {str(e)}\n\nDebug info: Please check if API key is set."
def evaluate_novel(pdf_file, llm_url, is_human_sample, lang, profile: gr.OAuthProfile = None, oauth_token: gr.OAuthToken = None, progress=gr.Progress()) -> tuple:
    """Main function to evaluate PDF file"""
    # Full submission pipeline: OAuth check -> PDF text extraction ->
    # Gemini evaluation -> plagiarism gate -> score computation ->
    # persist to the HF dataset -> refresh history and leaderboard.
    # `profile` and `oauth_token` are injected by Gradio's OAuth support.
    # Returns (markdown_result, history_df, leaderboard_df).
    try:
        # Check OAuth profile; unauthenticated users get a login prompt
        if profile:
            greeting = get_text("submitter", lang) + f"{profile.username}\n\n"
            username = profile.username
        else:
            greeting = get_text("login_required", lang)
            return greeting, None, None
        # Only the admin account may flag a submission as a human sample
        if is_human_sample and username != ADMIN_USERNAME:
            greeting += f"⚠️ {get_text('admin_only', lang)}\n\n"
            is_human_sample = False
        if not pdf_file:
            return greeting + "Please upload a PDF file.", None, None
        # Extract PDF filename
        pdf_filename = os.path.basename(pdf_file) if isinstance(pdf_file, str) else "uploaded.pdf"
        progress(0.2, desc="Reading PDF file...")
        text, word_count, page_count = extract_text_from_pdf(pdf_file)
        # word_count == 0 signals an extraction error; `text` holds the message
        if word_count == 0:
            return greeting + text, None, None
        # Enforce the 5,000-word minimum submission length
        if word_count < 5000:
            error_msg = get_text("min_words_error", lang).format(
                pages=page_count,
                words=word_count,
                needed=5000 - word_count
            )
            return greeting + error_msg, None, None
        progress(0.4, desc="AI is analyzing the work...")
        # Calculate volume bonus from word count
        bonus_score = calculate_bonus_score(word_count)
        greeting += get_text("work_info", lang) + f"{page_count}" + get_text("pages", lang)
        greeting += f"{word_count:,}" + get_text("words", lang)
        greeting += get_text("volume_bonus", lang) + f"{bonus_score}" + get_text("points", lang)
        greeting += get_text("evaluator", lang)
        evaluation_result = evaluate_novel_with_gemini(text, lang, is_human_sample)
        progress(0.8, desc="Saving evaluation results...")
        # Plagiarism / human-writing gate (skipped for admin human samples)
        plagiarism_detected = False
        if not is_human_sample:
            # Check if the AI flagged human writing or plagiarism AND gave 0/10
            if any(keyword in evaluation_result.lower() for keyword in ['human written', 'plagiarized', '인간 작성', '표절']):
                if '0/10' in evaluation_result or '0점/10점' in evaluation_result:
                    plagiarism_detected = True
        if plagiarism_detected:
            base_score = 0
            final_score = 0
            evaluation_result = get_text("plagiarism_detected", lang) + "\n\n" + evaluation_result
        else:
            # Extract base score with debugging output
            print(f"\n=== Score Extraction Debug ===")
            print(f"Bonus score calculated: {bonus_score}")
            base_score = extract_score_from_evaluation(evaluation_result)
            print(f"Extracted base score: {base_score}")
            # Compute final score: base + volume bonus, capped at 10.0
            final_score = min(base_score + bonus_score, 10.0)
            print(f"Final score calculated: {final_score}")
            # Sanity-check: 0.1 is the extraction-failure sentinel
            if base_score == 0.1 and "9.1" in evaluation_result:
                # A high score appears in the text but extraction fell back to 0.1
                print("WARNING: Possible score extraction mismatch detected")
                # Re-scan manually for any N/10 figures to aid debugging
                manual_check = re.findall(r'(\d+(?:\.\d+)?)/10', evaluation_result)
                if manual_check:
                    print(f"Found scores in text: {manual_check}")
        # Prepend the final score summary to the evaluation text
        score_display = get_text("final_score_title", lang)
        score_display += get_text("base_score", lang) + f"{base_score}/10" + get_text("points", lang).replace("(words over 5,000)", "") + "\n"
        score_display += get_text("bonus_score", lang) + f"{bonus_score}" + get_text("points_detail", lang)
        score_display += get_text("final_score", lang) + f"{final_score}/10" + get_text("max_10", lang)
        evaluation_result = score_display + evaluation_result
        # Read PDF content for saving
        with open(pdf_file, 'rb') as f:
            pdf_content = f.read()
        # Persist to the HF dataset (requires both server token and user OAuth)
        if HF_TOKEN and oauth_token:
            success, message = save_evaluation_to_dataset(username, pdf_filename, evaluation_result, base_score, final_score, word_count, llm_url, is_human_sample, pdf_content)
            if success:
                greeting += get_text("save_success", lang) + f"{message}\n\n"
            else:
                greeting += get_text("save_error", lang) + f"{message}\n\n"
        progress(1.0, desc="Evaluation complete!")
        # Load evaluation history for the submitting user
        history_df, _ = load_user_evaluations(username, lang)
        # Refresh leaderboard
        leaderboard_df = load_global_leaderboard(lang)
        return greeting + evaluation_result, history_df, leaderboard_df
    except Exception as e:
        return f"Error during evaluation: {str(e)}", None, None
def download_pdf(username, pdf_filename):
    """Download a submitted PDF from the HF dataset into a temp directory.

    Looks in the global leaderboard dataset first, then falls back to the
    user's personal dataset. Returns the local temp-file path, or None
    when the PDF cannot be found or downloaded.
    """
    try:
        api = HfApi()
        # Try the global dataset first
        # (bare `except:` narrowed to `except Exception:` so that
        # KeyboardInterrupt/SystemExit are not swallowed)
        try:
            pdf_path = api.hf_hub_download(
                repo_id=GLOBAL_DATASET,
                filename=f"pdfs/{username}_{pdf_filename}",
                repo_type="dataset",
                local_dir_use_symlinks=False
            )
        except Exception:
            # Fall back to the user's personal dataset
            try:
                pdf_path = api.hf_hub_download(
                    repo_id=f"{username}/{DATASET_NAME}",
                    filename=f"pdfs/{pdf_filename}",
                    repo_type="dataset",
                    local_dir_use_symlinks=False
                )
            except Exception:
                return None
        # Copy into the system temp dir so Gradio can serve the file
        temp_dir = tempfile.gettempdir()
        temp_path = os.path.join(temp_dir, f"{username}_{pdf_filename}")
        shutil.copy2(pdf_path, temp_path)
        return temp_path
    except Exception as e:
        print(f"Error downloading PDF: {e}")
        return None
def view_evaluation(username, pdf_filename, lang="en"):
    """Fetch the stored evaluation text for a submission from the HF dataset.

    Tries the per-submission .txt file first, then falls back to the
    `evaluation` column of the global leaderboard CSV. Returns the
    evaluation text, a "not found" message, or an error string.
    """
    try:
        api = HfApi()
        # Try the per-submission text file in the global dataset
        # (bare `except:` narrowed to `except Exception:` so that
        # KeyboardInterrupt/SystemExit are not swallowed)
        try:
            eval_path = api.hf_hub_download(
                repo_id=GLOBAL_DATASET,
                filename=f"evaluations/{username}_{pdf_filename}.txt",
                repo_type="dataset",
                local_dir_use_symlinks=False
            )
            with open(eval_path, 'r', encoding='utf-8') as f:
                evaluation = f.read()
            return evaluation
        except Exception:
            # Fall back to the leaderboard CSV if the txt file is missing
            try:
                csv_path = api.hf_hub_download(
                    repo_id=GLOBAL_DATASET,
                    filename="leaderboard.csv",
                    repo_type="dataset",
                    local_dir_use_symlinks=False
                )
                df = pd.read_csv(csv_path, encoding='utf-8')
                row = df[(df['username'] == username) & (df['pdf_filename'] == pdf_filename)]
                if not row.empty and 'evaluation' in df.columns:
                    return row.iloc[0]['evaluation']
            except Exception:
                pass
            return "Evaluation not found."
    except Exception as e:
        return f"Error loading evaluation: {str(e)}"
# Custom CSS - Modern and bright design with simplified main page.
# Passed to gr.Blocks(css=...); class names below correspond to the
# elem_classes used when building the UI (e.g. "guide-section").
css = """
/* Main container */
.container {
max-width: 1600px;
margin: auto;
padding: 20px;
}
/* Simple header for main page */
.simple-header {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 12px;
text-align: center;
margin-bottom: 20px;
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
}
.simple-header h3 {
font-size: 1.8em;
margin: 0;
}
/* Header gradient */
.leaderboard-header {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 30px;
border-radius: 15px;
text-align: center;
margin-bottom: 30px;
box-shadow: 0 10px 30px rgba(0, 0, 0, 0.1);
}
.leaderboard-header h2 {
font-size: 2.5em;
margin-bottom: 10px;
text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
}
/* Quick submit box */
.quick-submit-box {
background: linear-gradient(135deg, #f3f4f6 0%, #e5e7eb 100%);
border-radius: 12px;
padding: 25px;
margin-bottom: 20px;
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.05);
}
.quick-submit-box h3 {
color: #1f2937;
margin-top: 0;
margin-bottom: 15px;
}
/* Tabs styling */
.tabs {
border-radius: 12px;
overflow: hidden;
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.08);
}
button.tab-button {
font-size: 1.1em;
padding: 15px 30px;
background: white;
border: none;
transition: all 0.3s ease;
}
button.tab-button:hover {
background: #f3f4f6;
transform: translateY(-2px);
}
button.tab-button.selected {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
font-weight: bold;
}
/* Cards and boxes */
.gr-box {
border-radius: 12px;
border: 1px solid #e5e7eb;
padding: 20px;
background: white;
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.05);
transition: all 0.3s ease;
}
.gr-box:hover {
box-shadow: 0 8px 25px rgba(0, 0, 0, 0.1);
transform: translateY(-2px);
}
/* Buttons */
.gr-button {
border-radius: 8px;
font-weight: 600;
transition: all 0.3s ease;
}
.gr-button-primary {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
border: none;
}
.gr-button-primary:hover {
transform: translateY(-2px);
box-shadow: 0 8px 20px rgba(102, 126, 234, 0.4);
}
.gr-button-secondary {
background: #f3f4f6;
color: #4b5563;
border: 1px solid #e5e7eb;
}
.gr-button-secondary:hover {
background: #e5e7eb;
transform: translateY(-1px);
}
/* Download and View buttons */
.download-btn, .view-btn {
border: none;
padding: 6px 12px;
cursor: pointer;
border-radius: 6px;
font-size: 14px;
transition: all 0.3s ease;
}
.download-btn {
background-color: #10b981;
color: white;
}
.download-btn:hover {
background-color: #059669;
transform: translateY(-1px);
}
.view-btn {
background-color: #6366f1;
color: white;
}
.view-btn:hover {
background-color: #4f46e5;
transform: translateY(-1px);
}
/* Warning box */
.warning-box {
background: linear-gradient(135deg, #fee2e2 0%, #fecaca 100%);
border: 2px solid #ef4444;
border-radius: 12px;
padding: 20px;
margin: 20px 0;
box-shadow: 0 4px 15px rgba(239, 68, 68, 0.1);
}
.warning-box strong {
color: #dc2626;
font-size: 1.1em;
}
/* Success/Info boxes */
.success-box {
background: linear-gradient(135deg, #d1fae5 0%, #a7f3d0 100%);
border: 2px solid #10b981;
border-radius: 12px;
padding: 20px;
margin: 20px 0;
box-shadow: 0 4px 15px rgba(16, 185, 129, 0.1);
}
.info-box {
background: linear-gradient(135deg, #dbeafe 0%, #bfdbfe 100%);
border: 2px solid #3b82f6;
border-radius: 12px;
padding: 20px;
margin: 20px 0;
box-shadow: 0 4px 15px rgba(59, 130, 246, 0.1);
}
/* Table styling */
.gr-dataframe {
border-radius: 12px;
overflow: hidden;
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.08);
}
.gr-dataframe thead {
background: linear-gradient(135deg, #f3f4f6 0%, #e5e7eb 100%);
}
.gr-dataframe th {
padding: 15px;
font-weight: 700;
color: #374151;
text-transform: uppercase;
font-size: 0.85em;
letter-spacing: 0.05em;
}
.gr-dataframe td {
padding: 12px 15px;
border-bottom: 1px solid #f3f4f6;
}
.gr-dataframe tr:hover {
background: #f9fafb;
}
/* Score colors in table */
.score-nobel { color: #ef4444; font-weight: bold; }
.score-classic { color: #f59e0b; font-weight: bold; }
.score-bestseller { color: #8b5cf6; font-weight: bold; }
.score-professional { color: #3b82f6; font-weight: bold; }
.score-amateur { color: #6b7280; font-weight: bold; }
/* Modal styling */
.modal-overlay {
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
background: rgba(0, 0, 0, 0.5);
display: none;
justify-content: center;
align-items: center;
z-index: 1000;
}
.modal-content {
background: white;
border-radius: 15px;
padding: 30px;
max-width: 800px;
max-height: 80vh;
overflow-y: auto;
box-shadow: 0 20px 50px rgba(0, 0, 0, 0.3);
}
/* File upload area */
.gr-file {
border: 2px dashed #9ca3af;
border-radius: 12px;
background: #f9fafb;
transition: all 0.3s ease;
}
.gr-file:hover {
border-color: #667eea;
background: #ede9fe;
}
/* Language selector */
.language-selector {
position: absolute;
top: 20px;
right: 20px;
background: white;
border-radius: 8px;
padding: 8px;
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1);
}
/* Guide content styling */
.guide-content {
max-width: 1200px;
margin: 0 auto;
padding: 20px;
}
.guide-section {
background: white;
border-radius: 12px;
padding: 30px;
margin-bottom: 20px;
box-shadow: 0 4px 15px rgba(0, 0, 0, 0.05);
}
.guide-section h3 {
color: #1f2937;
margin-top: 0;
margin-bottom: 20px;
font-size: 1.5em;
}
.guide-section ul {
list-style: none;
padding-left: 0;
}
.guide-section ul li {
position: relative;
padding-left: 24px;
margin-bottom: 12px;
line-height: 1.6;
}
.guide-section ul li:before {
content: "▸";
position: absolute;
left: 0;
color: #667eea;
font-weight: bold;
}
/* Markdown content */
.markdown-content h3 {
color: #1f2937;
margin-top: 24px;
margin-bottom: 12px;
}
.markdown-content ul {
list-style: none;
padding-left: 0;
}
.markdown-content ul li {
position: relative;
padding-left: 24px;
margin-bottom: 8px;
}
.markdown-content ul li:before {
content: "▸";
position: absolute;
left: 0;
color: #667eea;
font-weight: bold;
}
/* Animations */
@keyframes fadeIn {
from { opacity: 0; transform: translateY(10px); }
to { opacity: 1; transform: translateY(0); }
}
.gr-box, .gr-button, .gr-dataframe {
animation: fadeIn 0.5s ease-out;
}
/* Responsive design */
@media (max-width: 768px) {
.container {
padding: 10px;
}
.leaderboard-header h2 {
font-size: 1.8em;
}
.simple-header h3 {
font-size: 1.5em;
}
.gr-dataframe {
font-size: 0.9em;
}
}
"""
# JavaScript code - simplified.
# NOTE: intentionally empty; injected via gr.HTML below as a placeholder
# for future custom JavaScript.
js_code = """
"""
# Create Gradio interface.
# Layout: language selector + OAuth login, then four tabs
# (Leaderboard / Submit / History / Guide). Component handles defined here
# are wired to callbacks further below.
with gr.Blocks(title="AGI Novel Evaluation Leaderboard", theme=gr.themes.Soft(), css=css) as demo:
    # Add JavaScript (currently an empty placeholder)
    gr.HTML(js_code)
    # Session state: current UI language and the leaderboard row selection
    current_lang = gr.State(value="en")
    selected_user = gr.State(value="")
    selected_file = gr.State(value="")
    # Language selector
    with gr.Row():
        with gr.Column(scale=10):
            title_md = gr.Markdown(get_text("title", "en"))
        with gr.Column(scale=1):
            lang_selector = gr.Radio(
                choices=[("English", "en"), ("한국어", "ko")],
                value="en",
                label="Language",
                interactive=True
            )
    # OAuth login button (required for submissions)
    gr.LoginButton()
    with gr.Tabs() as tabs:
        # Leaderboard tab - simplified main page
        with gr.TabItem(get_text("leaderboard_tab", "en"), id="leaderboard_tab") as leaderboard_tab:
            leaderboard_header = gr.HTML(get_text("simple_leaderboard_header", "en"))
            # Leaderboard at full width (quick-submit section removed);
            # "html" datatype columns carry clickable Download/View cells.
            leaderboard_display = gr.Dataframe(
                headers=[
                    get_text("rank", "en"),
                    get_text("author_id", "en"),
                    get_text("llm_service", "en"),
                    get_text("final_score_col", "en"),
                    get_text("word_count", "en"),
                    get_text("work_title", "en"),
                    get_text("submit_date", "en"),
                    get_text("human_sample", "en"),
                    get_text("download", "en"),
                    get_text("view_eval", "en")
                ],
                label="",
                interactive=False,
                wrap=True,
                datatype=["html", "html", "html", "html", "str", "str", "str", "str", "html", "html"]
            )
            # Manual actions section below the leaderboard (textboxes are
            # pre-filled by clicking a Download/View cell)
            gr.Markdown("### 🔧 Actions")
            with gr.Row():
                action_user = gr.Textbox(label="Username", placeholder="Enter username")
                action_file = gr.Textbox(label="Filename", placeholder="Enter filename")
            with gr.Row():
                manual_download_btn = gr.Button("📥 Download PDF", size="sm")
                manual_view_btn = gr.Button("👁️ View Evaluation", size="sm")
            download_result = gr.File(label="Downloaded PDF", visible=False)
            # Evaluation display (shown when viewing an evaluation)
            eval_display = gr.Markdown("", visible=False)
            refresh_btn = gr.Button(get_text("refresh_btn", "en"), variant="secondary")
        # Submit tab - detailed submission form
        with gr.TabItem(get_text("submit_tab", "en"), id="submit_tab") as submit_tab:
            with gr.Row():
                with gr.Column():
                    pdf_input = gr.File(
                        label=get_text("upload_label", "en"),
                        file_types=[".pdf"],
                        type="filepath"
                    )
                    llm_url_input = gr.Textbox(
                        label=get_text("llm_url_label", "en"),
                        placeholder=get_text("llm_url_placeholder", "en"),
                        lines=1,
                        max_lines=1
                    )
                    is_human_sample_input = gr.Checkbox(
                        label=get_text("is_human_sample_label", "en"),
                        value=False,
                        interactive=True
                    )
                    evaluate_btn = gr.Button(
                        get_text("evaluate_btn", "en"),
                        variant="primary",
                        size="lg"
                    )
                with gr.Column():
                    output = gr.Markdown(
                        label="Evaluation Results",
                        value=get_text("result_label", "en")
                    )
        # History tab - per-user submission history
        with gr.TabItem(get_text("history_tab", "en"), id="history_tab") as history_tab:
            history_btn = gr.Button(get_text("history_btn", "en"), variant="secondary")
            history_display = gr.Dataframe(
                headers=get_text("history_headers", "en"),
                label=get_text("history_label", "en"),
                interactive=False
            )
        # Guide tab - all detailed information
        with gr.TabItem(get_text("guide_tab", "en"), id="guide_tab") as guide_tab:
            with gr.Column(elem_classes="guide-content"):
                # Purpose section
                with gr.Group(elem_classes="guide-section"):
                    purpose_title_md = gr.Markdown(get_text("purpose_title", "en"))
                    purpose_desc_md = gr.Markdown(get_text("purpose_desc", "en"))
                # Why Novel Creation section
                with gr.Group(elem_classes="guide-section"):
                    why_title_md = gr.Markdown(get_text("why_title", "en"))
                    why_desc_md = gr.Markdown(get_text("why_desc", "en"))
                # Evaluation Criteria section
                with gr.Group(elem_classes="guide-section"):
                    criteria_title_md = gr.Markdown(get_text("criteria_title", "en"))
                    criteria_desc_md = gr.Markdown(get_text("criteria_desc", "en"))
                # Requirements and Scoring
                with gr.Row():
                    with gr.Column():
                        with gr.Group(elem_classes="guide-section"):
                            requirements_md = gr.Markdown(get_text("requirements", "en"))
                            bonus_md = gr.Markdown(get_text("bonus_system", "en"))
                    with gr.Column():
                        with gr.Group(elem_classes="guide-section"):
                            score_system_md = gr.Markdown(get_text("score_system", "en"))
                            grade_criteria_md = gr.Markdown(get_text("grade_criteria", "en"))
                # Evaluation Scale
                with gr.Group(elem_classes="guide-section"):
                    eval_scale_md = gr.Markdown(get_text("evaluation_scale", "en"))
                # Warning
                warning_html = gr.HTML(get_text("warning", "en"))
    # Quick submit result display (hidden by default)
    quick_submit_output = gr.Markdown(visible=False)
    # Language change handler
    def update_language(lang):
        """Return updated values for every localized UI component.

        IMPORTANT: the order of this tuple must exactly match the
        `outputs` list of `lang_selector.change` below — a mismatch
        silently writes the wrong value into the wrong component.
        """
        return (
            lang,  # Update current_lang state
            get_text("title", lang),
            gr.TabItem(label=get_text("leaderboard_tab", lang)),
            gr.TabItem(label=get_text("submit_tab", lang)),
            gr.TabItem(label=get_text("history_tab", lang)),
            gr.TabItem(label=get_text("guide_tab", lang)),
            get_text("simple_leaderboard_header", lang),
            gr.Button(value=get_text("refresh_btn", lang), variant="secondary"),
            gr.File(label=get_text("upload_label", lang)),
            gr.Textbox(label=get_text("llm_url_label", lang), placeholder=get_text("llm_url_placeholder", lang)),
            gr.Checkbox(label=get_text("is_human_sample_label", lang)),
            gr.Button(value=get_text("evaluate_btn", lang), variant="primary", size="lg"),
            gr.Markdown(value=get_text("result_label", lang)),
            gr.Button(value=get_text("history_btn", lang), variant="secondary"),
            load_global_leaderboard(lang),  # re-render leaderboard in the new language
            gr.Button(value=f"📥 {get_text('download_pdf', lang)}", size="sm"),
            gr.Button(value=f"👁️ {get_text('view_evaluation', lang)}", size="sm"),
            # Guide tab updates
            get_text("purpose_title", lang),
            get_text("purpose_desc", lang),
            get_text("why_title", lang),
            get_text("why_desc", lang),
            get_text("criteria_title", lang),
            get_text("criteria_desc", lang),
            get_text("requirements", lang),
            get_text("bonus_system", lang),
            get_text("score_system", lang),
            get_text("grade_criteria", lang),
            get_text("evaluation_scale", lang),
            get_text("warning", lang)
        )
    lang_selector.change(
        fn=update_language,
        inputs=[lang_selector],
        outputs=[
            current_lang, title_md,
            leaderboard_tab, submit_tab, history_tab, guide_tab,
            leaderboard_header, refresh_btn,
            pdf_input, llm_url_input, is_human_sample_input, evaluate_btn, output, history_btn, leaderboard_display,
            manual_download_btn, manual_view_btn,
            # Guide tab elements
            purpose_title_md, purpose_desc_md, why_title_md, why_desc_md,
            criteria_title_md, criteria_desc_md, requirements_md, bonus_md,
            score_system_md, grade_criteria_md, eval_scale_md, warning_html
        ]
    )
# Event handlers
evaluate_btn.click(
fn=evaluate_novel,
inputs=[pdf_input, llm_url_input, is_human_sample_input, current_lang],
outputs=[output, history_display, leaderboard_display],
show_progress=True
)
def refresh_history(profile: gr.OAuthProfile = None):
if not profile:
return None
lang = current_lang.value if hasattr(current_lang, 'value') else "en"
df, _ = load_user_evaluations(profile.username, lang)
return df
history_btn.click(
fn=refresh_history,
inputs=[],
outputs=[history_display]
)
refresh_btn.click(
fn=lambda lang: load_global_leaderboard(lang),
inputs=[current_lang],
outputs=[leaderboard_display]
)
# Click handler for dataframe rows
def on_dataframe_select(evt: gr.SelectData, dataframe):
if evt.index and len(evt.index) >= 2:
row_idx = evt.index[0]
col_idx = evt.index[1]
# Get column name
if dataframe is not None and not dataframe.empty:
cols = dataframe.columns.tolist()
if col_idx < len(cols):
col_name = cols[col_idx]
# Check if it's download or view column
if col_name in ["Download", "다운로드", "View", "평가보기"]:
# Get the HTML content
cell_value = dataframe.iloc[row_idx, col_idx]
# Extract username and filename from data attributes
import re
user_match = re.search(r'data-user="([^"]+)"', str(cell_value))
file_match = re.search(r'data-file="([^"]+)"', str(cell_value))
if user_match and file_match:
return user_match.group(1), file_match.group(1)
return "", ""
leaderboard_display.select(
fn=on_dataframe_select,
inputs=[leaderboard_display],
outputs=[action_user, action_file]
)
    # Manual download button: fetches the PDF from the HF dataset and
    # reveals it in the (normally hidden) gr.File component.
    def manual_download(user, file):
        """Download the given user's PDF; hide the file widget on failure."""
        if user and file:
            pdf_path = download_pdf(user, file)
            if pdf_path:
                return gr.File(value=pdf_path, visible=True)
        return gr.File(visible=False)
    manual_download_btn.click(
        fn=manual_download,
        inputs=[action_user, action_file],
        outputs=[download_result]
    )
    # Manual view button: shows the stored evaluation text as markdown.
    def manual_view(user, file, lang):
        """Render the stored evaluation for (user, file) in eval_display."""
        if user and file:
            evaluation = view_evaluation(user, file, lang)
            title = f"## 📋 Evaluation for {file}\n### Author: {user}\n\n"
            return gr.Markdown(value=title + evaluation, visible=True)
        return gr.Markdown(visible=False)
    manual_view_btn.click(
        fn=manual_view,
        inputs=[action_user, action_file, current_lang],
        outputs=[eval_display]
    )
    # Auto-load leaderboard on page load (English default)
    demo.load(
        fn=lambda: load_global_leaderboard("en"),
        inputs=[],
        outputs=[leaderboard_display]
    )
if __name__ == "__main__":
    demo.launch()