| import Foundation |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
/// Rule-based mask over a per-position probability array (`probs`) that is
/// aligned with a sequence of Unicode scalars.
///
/// Position `i` is treated as the boundary between `scalars[i]` and
/// `scalars[i + 1]`. Boundaries that match one of the categories handled in
/// `apply(_:scalars:)` have their probability raised to a category-specific
/// floor; all remaining boundaries are capped at `forbid`.
///
/// NOTE(review): going by the type name these are presumably pause/break
/// probabilities; the character classification lives in `CharClasses`, which is
/// declared outside this file.
enum PauseMask {

    // MARK: - Thresholds

    /// Floor when `scalars[i]` is clause punctuation and the next scalar is
    /// whitespace or CJK.
    private static let floorClause = 0.25
    /// Floor at the scalar immediately preceding a whitespace run that is
    /// followed by a connector word.
    private static let floorConnector = 0.05
    /// Floor between two adjacent CJK scalars.
    private static let floorHanzi = 5e-3
    /// Floor when either side of the boundary is whitespace.
    private static let floorSpace = 1e-4
    /// Floor when `scalars[i]` is CJK sentence punctuation.
    private static let cjkSentenceFloor = 0.9
    /// Ceiling applied to boundaries that match no category.
    private static let forbid = 1e-9

    // MARK: - Masking

    /// Returns a copy of `probs` with every boundary position adjusted.
    ///
    /// For each `i` in `0 ..< scalars.count - 1` the first matching rule wins:
    /// 1. clause punctuation followed by whitespace/CJK → at least `floorClause`
    /// 2. CJK sentence punctuation → at least `cjkSentenceFloor`
    /// 3. connector break position → at least `floorConnector`
    /// 4. whitespace on either side → at least `floorSpace`
    /// 5. CJK scalar followed by CJK scalar → at least `floorHanzi`
    /// 6. otherwise → at most `forbid`
    ///
    /// The entry for the final scalar is never modified, and input with fewer
    /// than two scalars is returned unchanged.
    ///
    /// - Parameters:
    ///   - probs: Probabilities indexed by scalar position. Positions that
    ///     `probs` does not contain are skipped instead of trapping, so a
    ///     shorter array is masked over the prefix it shares with `scalars`.
    ///   - scalars: The text as Unicode scalars.
    /// - Returns: The masked probabilities, same length as `probs`.
    static func apply(_ probs: [Double], scalars: [Unicode.Scalar]) -> [Double] {
        let scalarCount = scalars.count
        guard scalarCount > 1 else { return probs }

        var masked = probs
        let connectorBreaks = connectorBreakPositions(scalars)

        // Only visit boundaries that exist in BOTH arrays: indexing `masked`
        // past its end would be a runtime trap.
        let boundaryCount = min(scalarCount - 1, masked.count)

        for index in 0..<boundaryCount {
            let current = scalars[index]
            let next = scalars[index + 1]
            let nextIsSpace = CharClasses.isSpace(next)
            let nextEndsToken = nextIsSpace || CharClasses.isCJK(next)

            if CharClasses.clausePunct.contains(current) && nextEndsToken {
                masked[index] = max(masked[index], floorClause)
            } else if CharClasses.cjkSentencePunct.contains(current) {
                masked[index] = max(masked[index], cjkSentenceFloor)
            } else if connectorBreaks.contains(index) {
                masked[index] = max(masked[index], floorConnector)
            } else if nextIsSpace || CharClasses.isSpace(current) {
                masked[index] = max(masked[index], floorSpace)
            } else if CharClasses.isCJK(current) && CharClasses.isCJK(next) {
                masked[index] = max(masked[index], floorHanzi)
            } else {
                // No category matched: clamp down rather than raise.
                masked[index] = min(masked[index], forbid)
            }
        }
        return masked
    }

    // MARK: - Connector detection

    /// Finds the positions that sit directly before a whitespace run whose
    /// following word is listed in `CharClasses.connectors`.
    ///
    /// A "word" here is the maximal run of non-whitespace scalars after the
    /// whitespace run; it is compared after trimming ASCII punctuation from
    /// both ends and lowercasing. A whitespace run at the very start of the
    /// text has no preceding scalar and therefore yields no position.
    ///
    /// - Parameter scalars: The text as Unicode scalars.
    /// - Returns: Indices of the scalar preceding each qualifying whitespace run.
    private static func connectorBreakPositions(_ scalars: [Unicode.Scalar]) -> Set<Int> {
        var breaks = Set<Int>()
        let end = scalars.count
        var cursor = 0

        while cursor < end {
            // Advance to the next whitespace scalar.
            guard CharClasses.isSpace(scalars[cursor]) else {
                cursor += 1
                continue
            }

            // Consume the whole whitespace run...
            let runStart = cursor
            while cursor < end && CharClasses.isSpace(scalars[cursor]) { cursor += 1 }

            // ...then the word that follows it. `cursor` ends on the next
            // whitespace scalar (or `end`), so the outer loop resumes there.
            let wordStart = cursor
            while cursor < end && !CharClasses.isSpace(scalars[cursor]) { cursor += 1 }

            // Need a non-empty word and a scalar in front of the run.
            guard wordStart < cursor, runStart > 0 else { continue }

            let word = stripAsciiPunctuation(scalars[wordStart..<cursor]).lowercased()
            if CharClasses.connectors.contains(word) {
                breaks.insert(runStart - 1)
            }
        }
        return breaks
    }

    /// Returns `slice` as a `String` with leading and trailing scalars that are
    /// members of `CharClasses.asciiPunctuation` removed. Interior punctuation
    /// is kept; a slice made up only of such scalars yields the empty string.
    private static func stripAsciiPunctuation(_ slice: ArraySlice<Unicode.Scalar>) -> String {
        let isKept: (Unicode.Scalar) -> Bool = { !CharClasses.asciiPunctuation.contains($0) }

        // First and last scalars that survive trimming; none means nothing is left.
        guard let first = slice.firstIndex(where: isKept),
              let last = slice.lastIndex(where: isKept) else {
            return ""
        }

        var view = String.UnicodeScalarView()
        view.append(contentsOf: slice[first...last])
        return String(view)
    }
}
|
|