Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -115,12 +115,22 @@ def normalise_hyphens(text):
|
|
| 115 |
# Replace hyphen variants with U+002D for internal consistency
|
| 116 |
return text.replace('\u2011', '-').replace('\u2013', '-').replace('\u2014', '-')
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
def encode_text_fragment(text):
|
|
|
|
|
|
|
| 119 |
# Encode text for W3C Text Fragments, preserving only regular hyphens (U+002D)
|
| 120 |
-
|
| 121 |
-
# En dashes (U+2013) are encoded as %E2%80%93
|
| 122 |
-
# Em dashes (U+2014) are encoded as %E2%80%94
|
| 123 |
-
return urllib.parse.quote(text, safe='-')
|
| 124 |
|
| 125 |
def generate_citation_hash(author, year, url, fragment_text, cited_text, username, task_name, current_date, current_time):
|
| 126 |
# Normalise hyphens for consistent hash generation
|
|
@@ -131,7 +141,7 @@ def generate_citation_hash(author, year, url, fragment_text, cited_text, usernam
|
|
| 131 |
return hashlib.sha256(data.encode('utf-8')).hexdigest()
|
| 132 |
|
| 133 |
def format_citation_html(url, fragment_text, author, year, scc_hash):
|
| 134 |
-
# Use
|
| 135 |
encoded_fragment = encode_text_fragment(fragment_text)
|
| 136 |
full_url = f"{url}#:~:text={encoded_fragment}"
|
| 137 |
return f'<a href="{full_url}" data-hash="{scc_hash}">{author} ({year})</a>'
|
|
|
|
| 115 |
# Replace hyphen variants with U+002D for internal consistency
|
| 116 |
return text.replace('\u2011', '-').replace('\u2013', '-').replace('\u2014', '-')
|
| 117 |
|
| 118 |
+
def get_longest_segment(text, *, dashes="\u002D\u2011\u2013\u2014"):
    """Return the longest dash-free run of ``text``.

    ``text`` is split on every character in ``dashes``; each piece is
    stripped of surrounding whitespace and empty pieces are discarded.
    The longest remaining piece is returned (the first one wins on a tie).

    Args:
        text: The string to split. ``None`` is treated like an empty string.
        dashes: Characters to split on. Defaults to hyphen-minus (U+002D),
            non-breaking hyphen (U+2011), en dash (U+2013) and em dash
            (U+2014). An empty string disables splitting.

    Returns:
        The longest stripped segment, or ``text`` unchanged when no
        non-empty segment exists (for example a string made only of dashes
        and whitespace). Returns ``""`` for empty or ``None`` input.
    """
    # Imported locally so this helper does not rely on a module-level
    # ``import re`` that is not visible from this part of the file.
    import re

    if not text:
        return ""

    if dashes:
        # re.escape keeps "-", "]" and "^" literal inside the character class.
        pieces = re.split("[" + re.escape(dashes) + "]", text)
    else:
        pieces = [text]

    # Strip once per piece, then drop the pieces that end up empty.
    segments = [piece for piece in map(str.strip, pieces) if piece]
    if not segments:
        return text  # Nothing usable: hand back the original text.
    return max(segments, key=len)
|
| 128 |
+
|
| 129 |
def encode_text_fragment(text):
    """Percent-encode ``text`` for use in a ``#:~:text=`` URL directive.

    Only the longest dash-free segment of ``text`` (as chosen by
    ``get_longest_segment``) is encoded.

    Args:
        text: The quoted passage to link to.

    Returns:
        The percent-encoded segment, or ``""`` for empty or ``None`` input.
    """
    if not text:
        return ""

    # Get the longest segment if text contains dashes
    fragment_text = get_longest_segment(text)

    # quote() never escapes "-", so passing safe='-' had no effect. The text
    # directive syntax reserves "-" (prefix-/-suffix markers) together with
    # "," and "&", so a hyphen that survives (the dash-only fallback returns
    # the text unchanged) has to be encoded explicitly.
    encoded = urllib.parse.quote(fragment_text, safe='')
    return encoded.replace('-', '%2D')
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
def generate_citation_hash(author, year, url, fragment_text, cited_text, username, task_name, current_date, current_time):
|
| 136 |
# Normalise hyphens for consistent hash generation
|
|
|
|
| 141 |
return hashlib.sha256(data.encode('utf-8')).hexdigest()
|
| 142 |
|
| 143 |
def format_citation_html(url, fragment_text, author, year, scc_hash):
    """Build the citation anchor element as an HTML string.

    The link target is ``url`` extended with a text-fragment directive for
    ``fragment_text`` (encoded by ``encode_text_fragment``).

    Args:
        url: Address of the cited page.
        fragment_text: Passage the link should point at.
        author: Author name shown as the link text.
        year: Publication year shown in parentheses after the author.
        scc_hash: Value stored in the ``data-hash`` attribute.

    Returns:
        ``<a href="..." data-hash="...">author (year)</a>`` with every
        interpolated value HTML-escaped.
    """
    # Function-scope import: the module's import block is not visible here.
    import html

    url = str(url)
    encoded_fragment = encode_text_fragment(fragment_text)

    if not encoded_fragment:
        # No usable passage: link to the page itself rather than emitting an
        # empty text directive.
        full_url = url
    elif ":~:" in url:
        # The URL already carries a fragment directive; directives are
        # joined with "&".
        full_url = f"{url}&text={encoded_fragment}"
    elif "#" in url:
        # The URL already has a fragment; a second "#" would become part of
        # that fragment, so only the ":~:" delimiter is appended.
        full_url = f"{url}:~:text={encoded_fragment}"
    else:
        full_url = f"{url}#:~:text={encoded_fragment}"

    # Escape everything interpolated into markup so quotes, "&" or "<" in
    # the inputs cannot break out of the attribute or inject elements.
    href = html.escape(full_url, quote=True)
    digest = html.escape(str(scc_hash), quote=True)
    label = html.escape(f"{author} ({year})", quote=False)
    return f'<a href="{href}" data-hash="{digest}">{label}</a>'
|