Update README.md
Browse files
README.md
CHANGED
|
@@ -25,7 +25,6 @@ tokenizer = AutoTokenizer.from_pretrained("Kevintu/Engessay_grading_ML")
|
|
| 25 |
|
| 26 |
new_text = "The English Language Learner Insight, Proficiency and Skills Evaluation (ELLIPSE) Corpus is a freely available corpus of ~6,500 ELL writing samples that have been scored for overall holistic language proficiency as well as analytic proficiency scores related to cohesion, syntax, vocabulary, phraseology, grammar, and conventions. In addition, the ELLIPSE corpus provides individual and demographic information for the ELL writers in the corpus including economic status, gender, grade level (8-12), and race/ethnicity. The corpus provides language proficiency scores for individual writers and was developed to advance research in corpus and NLP approaches to assess overall and more fine-grained features of proficiency."
|
| 27 |
|
| 28 |
-
|
| 29 |
# Define the path to your text file
|
| 30 |
#file_path = 'path/to/yourfile.txt'
|
| 31 |
|
|
@@ -33,22 +32,17 @@ new_text = "The English Language Learner Insight, Proficiency and Skills Evaluat
|
|
| 33 |
#with open(file_path, 'r', encoding='utf-8') as file:
|
| 34 |
# new_text = file.read()
|
| 35 |
|
| 36 |
-
|
| 37 |
encoded_input = tokenizer(new_text, return_tensors='pt', padding=True, truncation=True, max_length=64)
|
| 38 |
-
|
| 39 |
model.eval()
|
| 40 |
|
| 41 |
# Perform the prediction
|
| 42 |
with torch.no_grad():
|
| 43 |
outputs = model(**encoded_input)
|
| 44 |
|
| 45 |
-
# Get the predictions (the output here depends on whether you are doing regression or classification)
|
| 46 |
predictions = outputs.logits.squeeze()
|
| 47 |
|
| 48 |
-
|
| 49 |
predicted_scores = predictions.numpy()
|
| 50 |
item_names = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
|
| 51 |
-
|
| 52 |
for item, score in zip(item_names, predicted_scores):
|
| 53 |
print(f"{item}: {score:.4f}")
|
| 54 |
|
|
@@ -68,35 +62,26 @@ for item, score in zip(item_names, predicted_scores):
|
|
| 68 |
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
| 69 |
import torch
|
| 70 |
|
| 71 |
-
|
| 72 |
model = AutoModelForSequenceClassification.from_pretrained("Kevintu/Engessay_grading_ML")
|
| 73 |
tokenizer = AutoTokenizer.from_pretrained("Kevintu/Engessay_grading_ML")
|
| 74 |
|
| 75 |
-
|
| 76 |
new_text = "The English Language Learner Insight, Proficiency and Skills Evaluation (ELLIPSE) Corpus is a freely available corpus of ~6,500 ELL writing samples that have been scored for overall holistic language proficiency as well as analytic proficiency scores related to cohesion, syntax, vocabulary, phraseology, grammar, and conventions. In addition, the ELLIPSE corpus provides individual and demographic information for the ELL writers in the corpus including economic status, gender, grade level (8-12), and race/ethnicity. The corpus provides language proficiency scores for individual writers and was developed to advance research in corpus and NLP approaches to assess overall and more fine-grained features of proficiency."
|
| 77 |
-
|
| 78 |
encoded_input = tokenizer(new_text, return_tensors='pt', padding=True, truncation=True, max_length=64)
|
| 79 |
|
| 80 |
-
|
| 81 |
model.eval()
|
| 82 |
with torch.no_grad():
|
| 83 |
outputs = model(**encoded_input)
|
| 84 |
|
| 85 |
-
|
| 86 |
predictions = outputs.logits.squeeze()
|
| 87 |
-
|
| 88 |
predicted_scores = predictions.numpy() # Convert to numpy array
|
| 89 |
item_names = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
|
| 90 |
|
| 91 |
# Linearly rescale predictions to the 1-10 range (maps 1 -> 1 and 5 -> 10)
|
| 92 |
scaled_scores = 2.25 * predicted_scores - 1.25
|
| 93 |
-
|
| 94 |
-
|
| 95 |
for item, score in zip(item_names, scaled_scores):
|
| 96 |
print(f"{item}: {score:.4f}")
|
| 97 |
|
| 98 |
##"output" (values between 1-10)
|
| 99 |
-
|
| 100 |
#cohesion: 6.7147
|
| 101 |
#syntax: 6.9354
|
| 102 |
#vocabulary: 7.5814
|
|
|
|
| 25 |
|
| 26 |
new_text = "The English Language Learner Insight, Proficiency and Skills Evaluation (ELLIPSE) Corpus is a freely available corpus of ~6,500 ELL writing samples that have been scored for overall holistic language proficiency as well as analytic proficiency scores related to cohesion, syntax, vocabulary, phraseology, grammar, and conventions. In addition, the ELLIPSE corpus provides individual and demographic information for the ELL writers in the corpus including economic status, gender, grade level (8-12), and race/ethnicity. The corpus provides language proficiency scores for individual writers and was developed to advance research in corpus and NLP approaches to assess overall and more fine-grained features of proficiency."
|
| 27 |
|
|
|
|
| 28 |
# Define the path to your text file
|
| 29 |
#file_path = 'path/to/yourfile.txt'
|
| 30 |
|
|
|
|
| 32 |
#with open(file_path, 'r', encoding='utf-8') as file:
|
| 33 |
# new_text = file.read()
|
| 34 |
|
|
|
|
| 35 |
encoded_input = tokenizer(new_text, return_tensors='pt', padding=True, truncation=True, max_length=64)
|
|
|
|
| 36 |
model.eval()
|
| 37 |
|
| 38 |
# Perform the prediction
|
| 39 |
with torch.no_grad():
|
| 40 |
outputs = model(**encoded_input)
|
| 41 |
|
|
|
|
| 42 |
predictions = outputs.logits.squeeze()
|
| 43 |
|
|
|
|
| 44 |
predicted_scores = predictions.numpy()
|
| 45 |
item_names = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
|
|
|
|
| 46 |
for item, score in zip(item_names, predicted_scores):
|
| 47 |
print(f"{item}: {score:.4f}")
|
| 48 |
|
|
|
|
| 62 |
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
| 63 |
import torch
|
| 64 |
|
|
|
|
| 65 |
model = AutoModelForSequenceClassification.from_pretrained("Kevintu/Engessay_grading_ML")
|
| 66 |
tokenizer = AutoTokenizer.from_pretrained("Kevintu/Engessay_grading_ML")
|
| 67 |
|
|
|
|
| 68 |
new_text = "The English Language Learner Insight, Proficiency and Skills Evaluation (ELLIPSE) Corpus is a freely available corpus of ~6,500 ELL writing samples that have been scored for overall holistic language proficiency as well as analytic proficiency scores related to cohesion, syntax, vocabulary, phraseology, grammar, and conventions. In addition, the ELLIPSE corpus provides individual and demographic information for the ELL writers in the corpus including economic status, gender, grade level (8-12), and race/ethnicity. The corpus provides language proficiency scores for individual writers and was developed to advance research in corpus and NLP approaches to assess overall and more fine-grained features of proficiency."
|
|
|
|
| 69 |
encoded_input = tokenizer(new_text, return_tensors='pt', padding=True, truncation=True, max_length=64)
|
| 70 |
|
|
|
|
| 71 |
model.eval()
|
| 72 |
with torch.no_grad():
|
| 73 |
outputs = model(**encoded_input)
|
| 74 |
|
|
|
|
| 75 |
predictions = outputs.logits.squeeze()
|
|
|
|
| 76 |
predicted_scores = predictions.numpy() # Convert to numpy array
|
| 77 |
item_names = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
|
| 78 |
|
| 79 |
# Linearly rescale predictions to the 1-10 range (maps 1 -> 1 and 5 -> 10)
|
| 80 |
scaled_scores = 2.25 * predicted_scores - 1.25
|
|
|
|
|
|
|
| 81 |
for item, score in zip(item_names, scaled_scores):
|
| 82 |
print(f"{item}: {score:.4f}")
|
| 83 |
|
| 84 |
##"output" (values between 1-10)
|
|
|
|
| 85 |
#cohesion: 6.7147
|
| 86 |
#syntax: 6.9354
|
| 87 |
#vocabulary: 7.5814
|