Spaces:
Sleeping
Sleeping
Gül Sena Altıntaş
committed on
Commit
·
279fdab
1
Parent(s):
93f64e6
Fixed Farsi copy-paste error, Coding [WIP]
Browse files
app.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
|
|
| 1 |
import gc
|
|
|
|
| 2 |
import logging
|
| 3 |
import os
|
| 4 |
import re
|
|
@@ -55,6 +57,7 @@ PREDEFINED_MODELS = [
|
|
| 55 |
"google/byt5-small",
|
| 56 |
"gsaltintas/supertoken_models-llama_gpt2",
|
| 57 |
"gsaltintas/supertoken_models-llama_google-gemma-2-2b",
|
|
|
|
| 58 |
]
|
| 59 |
# Global cache for loaded models
|
| 60 |
model_cache = {}
|
|
@@ -62,25 +65,93 @@ model_cache = {}
|
|
| 62 |
|
| 63 |
def parse_dataset(text):
|
| 64 |
"""Parse the input dataset text into structured questions"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
if not text.strip():
|
| 66 |
return [], "Please enter your dataset"
|
| 67 |
|
| 68 |
-
|
|
|
|
| 69 |
|
| 70 |
-
#
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
questions = []
|
| 75 |
errors = []
|
| 76 |
|
| 77 |
-
for i,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
# for i, line in enumerate(lines[1:], 2): # Start from line 2 (after header)
|
| 79 |
line = line.strip()
|
| 80 |
if not line:
|
| 81 |
continue
|
| 82 |
|
| 83 |
-
parts = [part
|
| 84 |
|
| 85 |
if len(parts) < 5:
|
| 86 |
errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
|
|
@@ -522,6 +593,23 @@ def create_summary_markdown(summary_stats):
|
|
| 522 |
return "\n".join(lines)
|
| 523 |
|
| 524 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
def create_detailed_results_html(questions, results):
|
| 526 |
"""Create detailed HTML results for each question"""
|
| 527 |
if not questions or not results:
|
|
@@ -614,6 +702,7 @@ def create_detailed_results_html(questions, results):
|
|
| 614 |
opacity: 0.7;
|
| 615 |
font-family: monospace;
|
| 616 |
}
|
|
|
|
| 617 |
</style>
|
| 618 |
"""
|
| 619 |
]
|
|
@@ -917,6 +1006,33 @@ css = """
|
|
| 917 |
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
| 918 |
font-size: 12px;
|
| 919 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 920 |
"""
|
| 921 |
|
| 922 |
# Create Gradio interface
|
|
@@ -958,6 +1074,7 @@ What is 2+2?,4,3,2,5
|
|
| 958 |
What is the capital of France?,Paris,London,Berlin,Paris""",
|
| 959 |
lines=8,
|
| 960 |
max_lines=15,
|
|
|
|
| 961 |
)
|
| 962 |
|
| 963 |
gr.Markdown("""
|
|
|
|
| 1 |
+
import csv
|
| 2 |
import gc
|
| 3 |
+
import io
|
| 4 |
import logging
|
| 5 |
import os
|
| 6 |
import re
|
|
|
|
| 57 |
"google/byt5-small",
|
| 58 |
"gsaltintas/supertoken_models-llama_gpt2",
|
| 59 |
"gsaltintas/supertoken_models-llama_google-gemma-2-2b",
|
| 60 |
+
"gsaltintas/supertoken_models-llama_google-gemma-2-2b-100b",
|
| 61 |
]
|
| 62 |
# Global cache for loaded models
|
| 63 |
model_cache = {}
|
|
|
|
| 65 |
|
| 66 |
def parse_dataset(text):
    """Parse delimited dataset text into a list of question dicts.

    Each non-empty record must provide at least five columns, in order:
    question, correct answer, then three answer choices. Columns beyond the
    fifth are ignored. The delimiter is a tab if the first non-empty line
    contains one, otherwise a comma. Quoted cells may span several lines;
    embedded newlines are collapsed to single spaces.

    Args:
        text: Raw dataset text (CSV or TSV). ``None`` is treated as empty.

    Returns:
        A ``(questions, error_msg)`` tuple. ``questions`` is a list of dicts
        with keys ``"question"``, ``"correct_answer"`` and ``"choices"``; the
        correct answer is appended to ``choices`` when it is not already one
        of them. ``error_msg`` is a newline-joined string of per-record
        problems, or ``""`` when there were none.
    """

    def clean_cell(s: str) -> str:
        # Flatten multi-line cells and trim whitespace plus any stray
        # surrounding double quotes left over from copy-paste.
        # NOTE(review): strip('"') also removes legitimate quotes at the
        # edges of a cell -- confirm this is intended.
        return s.strip().replace("\r", "").replace("\n", " ").strip('"').strip()

    if not text or not text.strip():
        return [], "Please enter your dataset"

    # Normalize line endings so the reader only ever sees "\n".
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Detect the delimiter from the first non-empty line.
    first_line = next((ln for ln in text.splitlines() if ln.strip()), "")
    delimiter = "\t" if "\t" in first_line else ","

    # csv.reader handles quoted cells, including ones that span lines.
    reader = csv.reader(io.StringIO(text), delimiter=delimiter, quotechar='"')

    questions = []
    errors = []
    prev_end_line = 0  # physical line on which the previous record ended

    try:
        for row in reader:
            # Report the physical line where this record starts; a plain
            # record counter drifts once a quoted cell spans several lines.
            start_line = prev_end_line + 1
            prev_end_line = reader.line_num

            # Skip empty rows.
            if not any(cell.strip() for cell in row):
                continue

            parts = [clean_cell(p) for p in row]
            if len(parts) < 5:
                errors.append(
                    f"Line {start_line}: Not enough columns "
                    f"(need 5, got {len(parts)})"
                )
                continue

            question = {
                "question": parts[0],
                "correct_answer": parts[1],
                "choices": [parts[2], parts[3], parts[4]],
            }

            if question["correct_answer"] not in question["choices"]:
                question["choices"].append(question["correct_answer"])

            questions.append(question)
    except csv.Error as exc:
        # Malformed input (e.g. an unterminated quote that swallows the rest
        # of the text) is reported through the normal error channel; records
        # parsed before the failure are still returned.
        errors.append(f"Line {reader.line_num}: Could not parse CSV ({exc})")

    error_msg = "\n".join(errors) if errors else ""
    return questions, error_msg
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def parse_datasetold(text):
|
| 116 |
+
"""Parse the input dataset text into structured questions"""
|
| 117 |
+
if not text.strip():
|
| 118 |
+
return [], "Please enter your dataset"
|
| 119 |
+
|
| 120 |
+
# Detect delimiter
|
| 121 |
+
sample_line = text.splitlines()[0]
|
| 122 |
+
delimiter = "\t" if "\t" in sample_line else ","
|
| 123 |
+
|
| 124 |
+
# Use csv.reader to correctly parse quotes & newlines
|
| 125 |
+
reader = csv.reader(io.StringIO(text), delimiter=delimiter)
|
| 126 |
+
|
| 127 |
+
questions = []
|
| 128 |
+
errors = []
|
| 129 |
+
for i, row in enumerate(reader, 1):
|
| 130 |
+
parts = [clean_cell(p) for p in row if p.strip()]
|
| 131 |
+
if len(parts) < 5:
|
| 132 |
+
errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
|
| 133 |
+
continue
|
| 134 |
+
|
| 135 |
+
question = {
|
| 136 |
+
"question": parts[0],
|
| 137 |
+
"correct_answer": parts[1],
|
| 138 |
+
"choices": [parts[2], parts[3], parts[4]],
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
if question["correct_answer"] not in question["choices"]:
|
| 142 |
+
question["choices"].append(question["correct_answer"])
|
| 143 |
+
|
| 144 |
+
questions.append(question)
|
| 145 |
+
|
| 146 |
+
error_msg = "\n".join(errors) if errors else ""
|
| 147 |
+
return questions, error_msg
|
| 148 |
+
for i, line in enumerate(reader, 1):
|
| 149 |
# for i, line in enumerate(lines[1:], 2): # Start from line 2 (after header)
|
| 150 |
line = line.strip()
|
| 151 |
if not line:
|
| 152 |
continue
|
| 153 |
|
| 154 |
+
parts = [clean_text(part) for part in line.split(delimiter)]
|
| 155 |
|
| 156 |
if len(parts) < 5:
|
| 157 |
errors.append(f"Line {i}: Not enough columns (need 5, got {len(parts)})")
|
|
|
|
| 593 |
return "\n".join(lines)
|
| 594 |
|
| 595 |
|
| 596 |
+
# CSS for universal text handling
|
| 597 |
+
universal_css = """
|
| 598 |
+
.universal-text textarea {
|
| 599 |
+
direction: auto !important;
|
| 600 |
+
text-align: start !important;
|
| 601 |
+
unicode-bidi: plaintext !important;
|
| 602 |
+
font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI',
|
| 603 |
+
'Roboto', 'Arial', 'Noto Sans', sans-serif !important;
|
| 604 |
+
}
|
| 605 |
+
|
| 606 |
+
/* Better handling for mixed content */
|
| 607 |
+
.universal-text textarea:focus {
|
| 608 |
+
unicode-bidi: plaintext !important;
|
| 609 |
+
}
|
| 610 |
+
"""
|
| 611 |
+
|
| 612 |
+
|
| 613 |
def create_detailed_results_html(questions, results):
|
| 614 |
"""Create detailed HTML results for each question"""
|
| 615 |
if not questions or not results:
|
|
|
|
| 702 |
opacity: 0.7;
|
| 703 |
font-family: monospace;
|
| 704 |
}
|
| 705 |
+
|
| 706 |
</style>
|
| 707 |
"""
|
| 708 |
]
|
|
|
|
| 1006 |
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
| 1007 |
font-size: 12px;
|
| 1008 |
}
|
| 1009 |
+
|
| 1010 |
+
.universal-text textarea {
|
| 1011 |
+
direction: ltr !important;
|
| 1012 |
+
text-align: left !important;
|
| 1013 |
+
unicode-bidi: bidi-override !important;
|
| 1014 |
+
font-family: 'Courier New', monospace !important;
|
| 1015 |
+
white-space: pre !important;
|
| 1016 |
+
}
|
| 1017 |
+
|
| 1018 |
+
/* Reset direction after paste */
|
| 1019 |
+
.universal-text textarea:focus {
|
| 1020 |
+
direction: auto !important;
|
| 1021 |
+
unicode-bidi: plaintext !important;
|
| 1022 |
+
}
|
| 1023 |
+
|
| 1024 |
+
# .universal-text textarea {
|
| 1025 |
+
# direction: auto !important;
|
| 1026 |
+
# text-align: start !important;
|
| 1027 |
+
# unicode-bidi: plaintext !important;
|
| 1028 |
+
# font-family: system-ui, -apple-system, BlinkMacSystemFont, 'Segoe UI',
|
| 1029 |
+
# 'Roboto', 'Arial', 'Noto Sans', sans-serif !important;
|
| 1030 |
+
# }
|
| 1031 |
+
|
| 1032 |
+
# /* Better handling for mixed content */
|
| 1033 |
+
# .universal-text textarea:focus {
|
| 1034 |
+
# unicode-bidi: plaintext !important;
|
| 1035 |
+
# }
|
| 1036 |
"""
|
| 1037 |
|
| 1038 |
# Create Gradio interface
|
|
|
|
| 1074 |
What is the capital of France?,Paris,London,Berlin,Paris""",
|
| 1075 |
lines=8,
|
| 1076 |
max_lines=15,
|
| 1077 |
+
elem_classes=["universal-text"],
|
| 1078 |
)
|
| 1079 |
|
| 1080 |
gr.Markdown("""
|