Jandayl committed on
Commit
2b60cf4
·
1 Parent(s): 2867a3c

added comments

Browse files
__pycache__/feature_core.cpython-311.pyc CHANGED
Binary files a/__pycache__/feature_core.cpython-311.pyc and b/__pycache__/feature_core.cpython-311.pyc differ
 
feature_core.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import re
2
  import subprocess
3
  import sys
@@ -50,7 +51,7 @@ def load_nlp_model(model_name: str = "tl_calamancy_md-0.2.0"):
50
 
51
  raise RuntimeError("Failed to load CalamanCy model. " + " | ".join(errors))
52
 
53
-
54
  def merge_dash_sentences(doc) -> List:
55
  """Merge sentences split by dash tokens (from hyphenated words)."""
56
  dash_tokens = {"-"}
@@ -72,7 +73,7 @@ def merge_dash_sentences(doc) -> List:
72
  merged.append(sent)
73
  return merged
74
 
75
-
76
  def simple_clean(text: str) -> str:
77
  if not isinstance(text, str):
78
  return ""
@@ -81,7 +82,7 @@ def simple_clean(text: str) -> str:
81
  text = re.sub(r"[^\w\s\-.!?]", "", text) # keep sentence-ending punctuation
82
  return text.strip()
83
 
84
-
85
  def basic_counts(doc, original_text: str) -> Tuple[int, int, List]:
86
  tokens = [t for t in doc if not t.is_punct and not t.is_space]
87
  num_words = len(tokens)
@@ -106,29 +107,31 @@ def mean_lengths(tokens, num_words: int, num_sentences: int):
106
  mean_sentence_length = num_words / num_sentences if num_sentences else 0
107
  return round(mean_word_length, 4), round(mean_sentence_length, 4)
108
 
109
-
110
  def type_token_ratio(tokens, num_words: int):
111
  word_list = [t.text.lower() for t in tokens]
112
  return round(len(set(word_list)) / num_words if num_words else 0, 4)
113
 
114
-
115
  def count_filipino_syllables(word: str) -> int:
116
- word = word.lower()
117
- word = re.sub(r"[^a-z-]", "", word)
 
118
 
119
- parts = word.split("-")
120
- syllables = 0
121
- for part in parts:
122
- groups = re.findall(r"[aeiou]", part)
123
- syllables += len(groups)
124
 
125
- return syllables
 
 
126
 
 
127
 
 
128
  def polysyllabic_count(tokens) -> int:
129
  return sum(1 for t in tokens if count_filipino_syllables(t.text) >= 3)
130
 
131
-
132
  def lexical_density_and_pos(tokens, num_words: int):
133
  content_pos = {"NOUN", "VERB", "ADJ", "ADV"}
134
  content_words = 0
@@ -153,7 +156,7 @@ def lexical_density_and_pos(tokens, num_words: int):
153
 
154
  return round(lexical_density, 4), pos_ratios
155
 
156
-
157
  def foreign_word_density(tokens):
158
  english_ngrams = ["th", "ph", "sh", "ch", "wh", "ck", "qu"]
159
  foreign_letters = ["f", "v", "z", "x", "q", "j", "c"]
@@ -168,7 +171,7 @@ def foreign_word_density(tokens):
168
 
169
  return round(count / len(tokens) if tokens else 0, 4)
170
 
171
-
172
  def detect_svo_vso(doc):
173
  sentences = merge_dash_sentences(doc)
174
  if not sentences:
@@ -195,7 +198,7 @@ def detect_svo_vso(doc):
195
 
196
  return "Unknown"
197
 
198
-
199
  def detect_sentence_type(doc):
200
  tokens = [t for t in doc if not t.is_punct and not t.is_space]
201
 
@@ -214,7 +217,7 @@ def detect_sentence_type(doc):
214
 
215
  return "Simple"
216
 
217
-
218
  def extract_features(text: str, nlp) -> Dict[str, Any]:
219
  if not text or not isinstance(text, str):
220
  return {}
 
1
+ # ONE OF THE CORE PROGRAMS OF THE PROJECT. REFERENCED BY feature_extractor and feature_extractor_web.
2
  import re
3
  import subprocess
4
  import sys
 
51
 
52
  raise RuntimeError("Failed to load CalamanCy model. " + " | ".join(errors))
53
 
54
+ # Merges sentences that contain dashes. Without this function, the model would split the sentence on every dash it encounters, which is counterproductive.
55
  def merge_dash_sentences(doc) -> List:
56
  """Merge sentences split by dash tokens (from hyphenated words)."""
57
  dash_tokens = {"-"}
 
73
  merged.append(sent)
74
  return merged
75
 
76
+ # Cleans the text (keeping sentence-ending punctuation) to avoid misidentifying simple sentences as compound/complex.
77
  def simple_clean(text: str) -> str:
78
  if not isinstance(text, str):
79
  return ""
 
82
  text = re.sub(r"[^\w\s\-.!?]", "", text) # keep sentence-ending punctuation
83
  return text.strip()
84
 
85
+ # Gets the sentence and word counts, plus the list of word tokens (punctuation and whitespace excluded).
86
  def basic_counts(doc, original_text: str) -> Tuple[int, int, List]:
87
  tokens = [t for t in doc if not t.is_punct and not t.is_space]
88
  num_words = len(tokens)
 
107
  mean_sentence_length = num_words / num_sentences if num_sentences else 0
108
  return round(mean_word_length, 4), round(mean_sentence_length, 4)
109
 
110
+ # Type-token ratio (TTR): unique lowercased word forms divided by the total word count. Measures the lexical diversity (vocabulary richness) of a sample.
111
  def type_token_ratio(tokens, num_words: int):
112
  word_list = [t.text.lower() for t in tokens]
113
  return round(len(set(word_list)) / num_words if num_words else 0, 4)
114
 
 
115
  def count_filipino_syllables(word: str) -> int:
116
+ """Approximate Filipino syllable count by counting vowel nuclei."""
117
+ if not isinstance(word, str):
118
+ return 0
119
 
120
+ word = re.sub(r"[^a-z-]", "", word.lower())
121
+ if not word:
122
+ return 0
 
 
123
 
124
+ syllables = 0
125
+ for part in filter(None, word.split("-")):
126
+ syllables += len(re.findall(r"[aeiou]", part))
127
 
128
+ return max(syllables, 1)
129
 
130
+ # Counts the tokens that have 3 or more syllables.
131
  def polysyllabic_count(tokens) -> int:
132
  return sum(1 for t in tokens if count_filipino_syllables(t.text) >= 3)
133
 
134
+ # Computes lexical density and part-of-speech ratios for the token list.
135
  def lexical_density_and_pos(tokens, num_words: int):
136
  content_pos = {"NOUN", "VERB", "ADJ", "ADV"}
137
  content_words = 0
 
156
 
157
  return round(lexical_density, 4), pos_ratios
158
 
159
+ # Identifies foreign words by looking for English letter combinations and letters foreign to the Filipino alphabet, then computes their density.
160
  def foreign_word_density(tokens):
161
  english_ngrams = ["th", "ph", "sh", "ch", "wh", "ck", "qu"]
162
  foreign_letters = ["f", "v", "z", "x", "q", "j", "c"]
 
171
 
172
  return round(count / len(tokens) if tokens else 0, 4)
173
 
174
+ # Checks whether a sentence follows Subject-Verb-Object (SVO) or Verb-Subject-Object (VSO) word order.
175
  def detect_svo_vso(doc):
176
  sentences = merge_dash_sentences(doc)
177
  if not sentences:
 
198
 
199
  return "Unknown"
200
 
201
+ # Detects keywords that mark subordinate and coordinate clauses, and classifies the sentence according to the clause types it contains.
202
  def detect_sentence_type(doc):
203
  tokens = [t for t in doc if not t.is_punct and not t.is_space]
204
 
 
217
 
218
  return "Simple"
219
 
220
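+ # Main entry point: extracts the feature dictionary for a text using the given nlp pipeline (returns an empty dict for empty or non-string input).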
221
  def extract_features(text: str, nlp) -> Dict[str, Any]:
222
  if not text or not isinstance(text, str):
223
  return {}
feature_extractor_web.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import logging
2
  from typing import Any, Dict
3
 
 
1
+ # USED BY THE WEB-APP FOR EXTRACTING FEATURES.
2
+
3
  import logging
4
  from typing import Any, Dict
5
 
models/testing.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import pandas as pd
2
 
3
  df = pd.read_csv("corpus_clean.csv")
 
1
+ # THIS FILE IS NOT USED BY THE CURRENT SYSTEM. IT WAS ONLY USED FOR TRAINING EARLY VERSIONS OF THE SYSTEM.
2
  import pandas as pd
3
 
4
  df = pd.read_csv("corpus_clean.csv")
test_api.py DELETED
@@ -1,31 +0,0 @@
1
- import requests
2
- import json
3
-
4
- # Test health endpoint
5
- print("Testing health endpoint...")
6
- response = requests.get('http://localhost:5000/health')
7
- print(f"Health check: {response.json()}")
8
- print()
9
-
10
- # Test prediction
11
- print("Testing prediction...")
12
- test_texts = [
13
- "Ang aso ay tumakbo sa parke.",
14
- "Ang mga mag-aaral ay masigasig na nag-aaral para sa kanilang pagsusulit bukas.",
15
- "Ang komprehensibong pagsusuri ng sosyo-ekonomikong kalagayan ay nagpapakita ng makabuluhang pagbabago."
16
- ]
17
-
18
- for text in test_texts:
19
- response = requests.post(
20
- 'http://localhost:5000/api/predict',
21
- json={'text': text}
22
- )
23
-
24
- if response.status_code == 200:
25
- data = response.json()
26
- print(f"\nText: {text[:50]}...")
27
- print(f"Prediction: {data['prediction']['predicted_class']}")
28
- print(f"Grade: {data['prediction']['grade_level']}")
29
- print(f"Confidence: {max(data['prediction']['confidences'].values()):.3f}")
30
- else:
31
- print(f"Error: {response.json()}")