Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -420,7 +420,7 @@ class FAADocumentChecker(DocumentChecker):
|
|
| 420 |
PREDEFINED_ACRONYMS = {
|
| 421 |
'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
|
| 422 |
'MD', 'MIL', 'MO', 'No.', 'PDF', 'SAE', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
|
| 423 |
-
'WA', 'XX', 'ZIP'
|
| 424 |
}
|
| 425 |
|
| 426 |
# Constructor
|
|
@@ -820,17 +820,18 @@ class FAADocumentChecker(DocumentChecker):
|
|
| 820 |
|
| 821 |
# Check terminology patterns
|
| 822 |
for pattern_config in terminology_patterns:
|
| 823 |
-
|
|
|
|
| 824 |
for match in matches:
|
| 825 |
if pattern_config.replacement: # Only if there's a replacement term
|
| 826 |
unique_issues.add((match.group(), pattern_config.replacement))
|
| 827 |
|
| 828 |
# Check prohibited patterns
|
| 829 |
for pattern_config in prohibited_patterns:
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
|
| 835 |
# Format issues as simple replacement instructions
|
| 836 |
formatted_issues = [
|
|
@@ -1095,10 +1096,28 @@ class FAADocumentChecker(DocumentChecker):
|
|
| 1095 |
|
| 1096 |
incorrect_sentences = []
|
| 1097 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1098 |
for paragraph in doc:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1099 |
# Split the paragraph into sentences based on common sentence-ending punctuation
|
| 1100 |
-
sentences = re.split(r'(?<=[.!?]) +',
|
| 1101 |
for sentence in sentences:
|
|
|
|
|
|
|
| 1102 |
if sentence.endswith('..'):
|
| 1103 |
incorrect_sentences.append({'sentence': sentence.strip()})
|
| 1104 |
|
|
@@ -1516,6 +1535,24 @@ class FAADocumentChecker(DocumentChecker):
|
|
| 1516 |
List of tuples containing (sentence, parent_paragraph)
|
| 1517 |
"""
|
| 1518 |
sentences = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1519 |
for paragraph in doc:
|
| 1520 |
paragraph = paragraph.strip()
|
| 1521 |
|
|
@@ -1527,17 +1564,34 @@ class FAADocumentChecker(DocumentChecker):
|
|
| 1527 |
):
|
| 1528 |
continue
|
| 1529 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1530 |
# Split paragraph into sentences
|
| 1531 |
-
para_sentences = re.split(r'(?<=[.!?])\s+',
|
| 1532 |
|
| 1533 |
# Process each sentence
|
| 1534 |
for sentence in para_sentences:
|
|
|
|
|
|
|
|
|
|
| 1535 |
sentence = sentence.strip()
|
| 1536 |
if skip_empty and not sentence:
|
| 1537 |
continue
|
| 1538 |
sentences.append((sentence, paragraph))
|
| 1539 |
|
| 1540 |
-
return sentences
|
| 1541 |
|
| 1542 |
@profile_performance
|
| 1543 |
def check_parentheses(self, doc: List[str]) -> DocumentCheckResult:
|
|
|
|
| 420 |
PREDEFINED_ACRONYMS = {
|
| 421 |
'AGC', 'AIR', 'CFR', 'DC', 'DOT', 'FAA IR-M', 'FAQ', 'i.e.', 'e.g.', 'MA',
|
| 422 |
'MD', 'MIL', 'MO', 'No.', 'PDF', 'SAE', 'SSN', 'TX', 'U.S.', 'U.S.C.', 'USA', 'US',
|
| 423 |
+
'WA', 'XX', 'ZIP', 'ACO' # Added ACO to ignore in acronym checks
|
| 424 |
}
|
| 425 |
|
| 426 |
# Constructor
|
|
|
|
| 820 |
|
| 821 |
# Check terminology patterns
|
| 822 |
for pattern_config in terminology_patterns:
|
| 823 |
+
compiled_pattern = pattern_config.compile()
|
| 824 |
+
matches = list(compiled_pattern.finditer(sentence))
|
| 825 |
for match in matches:
|
| 826 |
if pattern_config.replacement: # Only if there's a replacement term
|
| 827 |
unique_issues.add((match.group(), pattern_config.replacement))
|
| 828 |
|
| 829 |
# Check prohibited patterns
|
| 830 |
for pattern_config in prohibited_patterns:
|
| 831 |
+
compiled_pattern = pattern_config.compile()
|
| 832 |
+
match = compiled_pattern.search(sentence)
|
| 833 |
+
if match and pattern_config.replacement: # Only if there's a replacement term
|
| 834 |
+
unique_issues.add((match.group(), pattern_config.replacement))
|
| 835 |
|
| 836 |
# Format issues as simple replacement instructions
|
| 837 |
formatted_issues = [
|
|
|
|
| 1096 |
|
| 1097 |
incorrect_sentences = []
|
| 1098 |
|
| 1099 |
+
# Common abbreviations whose trailing period does not end a sentence
# (note: 'CFR' contains no period, so protecting it below is a no-op)
|
| 1100 |
+
abbreviations = {
|
| 1101 |
+
'U.S.C.', 'U.S.', 'CFR', 'e.g.', 'i.e.', 'etc.', 'vs.', 'Dr.', 'Mr.',
|
| 1102 |
+
'Mrs.', 'Ms.', 'Prof.', 'Ph.D.', 'M.D.', 'B.A.', 'M.A.', 'Ph.D.'
|
| 1103 |
+
}
|
| 1104 |
+
|
| 1105 |
+
# Create a regex pattern that matches these abbreviations
|
| 1106 |
+
abbr_pattern = '|'.join(re.escape(abbr) for abbr in abbreviations)
|
| 1107 |
+
|
| 1108 |
for paragraph in doc:
|
| 1109 |
+
# First, mask the periods inside abbreviations so the sentence split below does not break on them
|
| 1110 |
+
protected_paragraph = re.sub(
|
| 1111 |
+
f'({abbr_pattern})',
|
| 1112 |
+
lambda m: m.group(1).replace('.', 'ABBR_DOT'),
|
| 1113 |
+
paragraph
|
| 1114 |
+
)
|
| 1115 |
+
|
| 1116 |
# Split the paragraph into sentences based on common sentence-ending punctuation
|
| 1117 |
+
sentences = re.split(r'(?<=[.!?]) +', protected_paragraph)
|
| 1118 |
for sentence in sentences:
|
| 1119 |
+
# Restore the periods in abbreviations
|
| 1120 |
+
sentence = sentence.replace('ABBR_DOT', '.')
|
| 1121 |
if sentence.endswith('..'):
|
| 1122 |
incorrect_sentences.append({'sentence': sentence.strip()})
|
| 1123 |
|
|
|
|
| 1535 |
List of tuples containing (sentence, parent_paragraph)
|
| 1536 |
"""
|
| 1537 |
sentences = []
|
| 1538 |
+
|
| 1539 |
+
# Common abbreviations whose trailing period does not end a sentence
# (note: 'CFR' contains no period, so protecting it below is a no-op)
|
| 1540 |
+
abbreviations = {
|
| 1541 |
+
'U.S.C.', 'U.S.', 'CFR', 'e.g.', 'i.e.', 'etc.', 'vs.', 'Dr.', 'Mr.',
|
| 1542 |
+
'Mrs.', 'Ms.', 'Prof.', 'Ph.D.', 'M.D.', 'B.A.', 'M.A.', 'Ph.D.'
|
| 1543 |
+
}
|
| 1544 |
+
|
| 1545 |
+
# Legal citation patterns that shouldn't be split
|
| 1546 |
+
legal_citations = [
|
| 1547 |
+
r'\d+ U\.S\.C\. § \d+\([a-zA-Z0-9]*\)(?:\([a-zA-Z0-9]*\))?', # e.g., 5 U.S.C. § 533(a)(1)
|
| 1548 |
+
r'\d+ CFR § \d+\.\d+', # e.g., 14 CFR § 1.1
|
| 1549 |
+
r'\d+ CFR part \d+' # e.g., 14 CFR part 1
|
| 1550 |
+
]
|
| 1551 |
+
|
| 1552 |
+
# Create regex patterns that match these abbreviations and the legal citations
|
| 1553 |
+
abbr_pattern = '|'.join(re.escape(abbr) for abbr in abbreviations)
|
| 1554 |
+
legal_pattern = '|'.join(legal_citations)
|
| 1555 |
+
|
| 1556 |
for paragraph in doc:
|
| 1557 |
paragraph = paragraph.strip()
|
| 1558 |
|
|
|
|
| 1564 |
):
|
| 1565 |
continue
|
| 1566 |
|
| 1567 |
+
# First, protect legal citations from being split
|
| 1568 |
+
protected_paragraph = re.sub(
|
| 1569 |
+
f'({legal_pattern})',
|
| 1570 |
+
lambda m: m.group(1).replace('.', 'LEGAL_DOT'),
|
| 1571 |
+
paragraph
|
| 1572 |
+
)
|
| 1573 |
+
|
| 1574 |
+
# Then protect abbreviations from being split
|
| 1575 |
+
protected_paragraph = re.sub(
|
| 1576 |
+
f'({abbr_pattern})',
|
| 1577 |
+
lambda m: m.group(1).replace('.', 'ABBR_DOT'),
|
| 1578 |
+
protected_paragraph
|
| 1579 |
+
)
|
| 1580 |
+
|
| 1581 |
# Split paragraph into sentences
|
| 1582 |
+
para_sentences = re.split(r'(?<=[.!?])\s+', protected_paragraph)
|
| 1583 |
|
| 1584 |
# Process each sentence
|
| 1585 |
for sentence in para_sentences:
|
| 1586 |
+
# Restore the periods in legal citations and abbreviations
|
| 1587 |
+
sentence = sentence.replace('LEGAL_DOT', '.')
|
| 1588 |
+
sentence = sentence.replace('ABBR_DOT', '.')
|
| 1589 |
sentence = sentence.strip()
|
| 1590 |
if skip_empty and not sentence:
|
| 1591 |
continue
|
| 1592 |
sentences.append((sentence, paragraph))
|
| 1593 |
|
| 1594 |
+
return sentences
|
| 1595 |
|
| 1596 |
@profile_performance
|
| 1597 |
def check_parentheses(self, doc: List[str]) -> DocumentCheckResult:
|