Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -354,6 +354,16 @@ class DocumentCheckerConfig:
|
|
| 354 |
description="Ignore 'title 14, Code of Federal Regulations (14 CFR)'",
|
| 355 |
is_error=False
|
| 356 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
PatternConfig(
|
| 358 |
pattern=r'\bAD Compliance Team \(AD CRT\)\b',
|
| 359 |
description="Ignore 'AD Compliance Team (AD CRT)'",
|
|
@@ -658,7 +668,7 @@ class FAADocumentChecker(DocumentChecker):
|
|
| 658 |
PREDEFINED_ACRONYMS = {
|
| 659 |
'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
|
| 660 |
'MD', 'MIL', 'MO', 'No.', 'PDF', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
|
| 661 |
-
'WA', 'ZIP'
|
| 662 |
}
|
| 663 |
|
| 664 |
# Constructor
|
|
@@ -1078,48 +1088,47 @@ class FAADocumentChecker(DocumentChecker):
|
|
| 1078 |
if not self.validate_input(doc):
|
| 1079 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
| 1080 |
|
| 1081 |
-
section_patterns = self.config_manager.pattern_registry.get('section_symbol', [])
|
| 1082 |
issues = []
|
| 1083 |
-
|
| 1084 |
for paragraph in doc:
|
| 1085 |
sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip())
|
| 1086 |
|
| 1087 |
for sentence in sentences:
|
| 1088 |
sentence = sentence.strip()
|
| 1089 |
-
|
| 1090 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1091 |
|
| 1092 |
-
|
| 1093 |
-
|
| 1094 |
-
|
| 1095 |
-
issues.append({
|
| 1096 |
-
'incorrect': section_ref,
|
| 1097 |
-
'correct': f"Section {section_ref.lstrip('§')}",
|
| 1098 |
-
'is_sentence_start': True # Flag to indicate sentence start issue
|
| 1099 |
-
})
|
| 1100 |
|
| 1101 |
-
|
| 1102 |
-
|
| 1103 |
-
|
| 1104 |
-
incorrect = match.group()
|
| 1105 |
-
# Remove § symbol without adding 'Section'
|
| 1106 |
-
correct = incorrect.replace('§ ', '')
|
| 1107 |
-
issues.append({
|
| 1108 |
-
'incorrect': incorrect,
|
| 1109 |
-
'correct': correct
|
| 1110 |
-
})
|
| 1111 |
|
| 1112 |
-
|
| 1113 |
-
|
| 1114 |
-
|
| 1115 |
-
|
| 1116 |
-
|
| 1117 |
-
|
| 1118 |
-
|
| 1119 |
-
|
| 1120 |
-
}
|
|
|
|
| 1121 |
|
| 1122 |
-
return DocumentCheckResult(success=
|
| 1123 |
|
| 1124 |
@profile_performance
|
| 1125 |
def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult:
|
|
|
|
| 354 |
description="Ignore 'title 14, Code of Federal Regulations (14 CFR)'",
|
| 355 |
is_error=False
|
| 356 |
),
|
| 357 |
+
PatternConfig(
|
| 358 |
+
pattern=r'\btitle 49 of the United States Code \(49 U.S.C.\)\b',
|
| 359 |
+
description="Ignore 'title 49 of the United States Code (49 U.S.C.)'",
|
| 360 |
+
is_error=False
|
| 361 |
+
),
|
| 362 |
+
PatternConfig(
|
| 363 |
+
pattern=r'\btitle 49, United States Code \(49 U.S.C.\)\b',
|
| 364 |
+
description="Ignore 'title 49, United States Code (49 U.S.C.)'",
|
| 365 |
+
is_error=False
|
| 366 |
+
),
|
| 367 |
PatternConfig(
|
| 368 |
pattern=r'\bAD Compliance Team \(AD CRT\)\b',
|
| 369 |
description="Ignore 'AD Compliance Team (AD CRT)'",
|
|
|
|
| 668 |
PREDEFINED_ACRONYMS = {
|
| 669 |
'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
|
| 670 |
'MD', 'MIL', 'MO', 'No.', 'PDF', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
|
| 671 |
+
'WA', 'XX', 'ZIP'
|
| 672 |
}
|
| 673 |
|
| 674 |
# Constructor
|
|
|
|
| 1088 |
if not self.validate_input(doc):
|
| 1089 |
return DocumentCheckResult(success=False, issues=[{'error': 'Invalid document input'}])
|
| 1090 |
|
|
|
|
| 1091 |
issues = []
|
| 1092 |
+
|
| 1093 |
for paragraph in doc:
|
| 1094 |
sentences = re.split(r'(?<=[.!?])\s+', paragraph.strip())
|
| 1095 |
|
| 1096 |
for sentence in sentences:
|
| 1097 |
sentence = sentence.strip()
|
| 1098 |
+
|
| 1099 |
+
# Flag '14 CFR § N.N' citations: the § symbol should be dropped after '14 CFR'
|
| 1100 |
+
cfr_matches = re.finditer(r'\b14 CFR §\s*(\d+\.\d+)\b', sentence)
|
| 1101 |
+
for match in cfr_matches:
|
| 1102 |
+
# Skip when the sentence contains a 'U.S.C. §' citation anywhere (tested per sentence, not per match)
|
| 1103 |
+
if not re.search(r'U\.S\.C\.\s*§', sentence):
|
| 1104 |
+
full_match = match.group(0)
|
| 1105 |
+
section_num = match.group(1)
|
| 1106 |
+
issues.append({
|
| 1107 |
+
'incorrect': full_match,
|
| 1108 |
+
'correct': f'14 CFR {section_num}',
|
| 1109 |
+
'description': f"Replace '{full_match}' with '14 CFR {section_num}'"
|
| 1110 |
+
})
|
| 1111 |
|
| 1112 |
+
# Skip the remaining checks for any sentence that contains a 'U.S.C. §' or 'U.S.C. §§' citation
|
| 1113 |
+
if re.search(r'U\.S\.C\.\s*(?:§|§§)', sentence):
|
| 1114 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1115 |
|
| 1116 |
+
# Skip the remaining checks for any sentence that contains a '14 CFR §' citation
|
| 1117 |
+
if re.search(r'14 CFR\s*§', sentence):
|
| 1118 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1119 |
|
| 1120 |
+
# Check section symbol at start of sentence
|
| 1121 |
+
if sentence.startswith('§'):
|
| 1122 |
+
match = re.match(r'^§\s*(\d+(?:\.\d+)?)', sentence)
|
| 1123 |
+
if match:
|
| 1124 |
+
section_num = match.group(1)
|
| 1125 |
+
issues.append({
|
| 1126 |
+
'incorrect': f'§ {section_num}',
|
| 1127 |
+
'correct': f'Section {section_num}',
|
| 1128 |
+
'description': f"Replace '§ {section_num}' with 'Section {section_num}'"
|
| 1129 |
+
})
|
| 1130 |
|
| 1131 |
+
return DocumentCheckResult(success=len(issues) == 0, issues=issues)
|
| 1132 |
|
| 1133 |
@profile_performance
|
| 1134 |
def caption_check(self, doc: List[str], doc_type: str, caption_type: str) -> DocumentCheckResult:
|