upload
Browse files
- pdf_parser.py +18 -18
- requirements.txt +2 -1
pdf_parser.py
CHANGED
|
@@ -4,9 +4,9 @@ from scipdf_utils import parse_pdf_to_dict
|
|
| 4 |
|
| 5 |
|
| 6 |
class GrobidSciPDFPaser(AbstractPDFParser):
|
| 7 |
-
import pysbd
|
| 8 |
-
seg_en = pysbd.Segmenter(language="en", clean=False)
|
| 9 |
-
seg_chinese = pysbd.Segmenter(language="zh", clean=False)
|
| 10 |
|
| 11 |
def __init__(self, pdf_link, db_name="grobid_scipdf", short_thereshold=30) -> None:
|
| 12 |
"""Initialize the PDF parser
|
|
@@ -131,18 +131,18 @@ class GrobidSciPDFPaser(AbstractPDFParser):
|
|
| 131 |
})
|
| 132 |
return section_pair_list
|
| 133 |
|
| 134 |
-
@staticmethod
|
| 135 |
-
def _determine_optimal_split_of_pargraphs(section_pair_list) -> None:
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
class GrobidSciPDFPaser(AbstractPDFParser):
|
| 7 |
+
# import pysbd
|
| 8 |
+
# seg_en = pysbd.Segmenter(language="en", clean=False)
|
| 9 |
+
# seg_chinese = pysbd.Segmenter(language="zh", clean=False)
|
| 10 |
|
| 11 |
def __init__(self, pdf_link, db_name="grobid_scipdf", short_thereshold=30) -> None:
|
| 12 |
"""Initialize the PDF parser
|
|
|
|
| 131 |
})
|
| 132 |
return section_pair_list
|
| 133 |
|
| 134 |
+
# @staticmethod
|
| 135 |
+
# def _determine_optimal_split_of_pargraphs(section_pair_list) -> None:
|
| 136 |
+
# """
|
| 137 |
+
# split based on some magic rules
|
| 138 |
+
# """
|
| 139 |
+
# import pysbd
|
| 140 |
+
# for section_pair in section_pair_list:
|
| 141 |
+
# if GrobidSciPDFPaser._check_chinese(section_pair["text"]):
|
| 142 |
+
# seg = GrobidSciPDFPaser.seg_chinese
|
| 143 |
+
# else:
|
| 144 |
+
# seg = GrobidSciPDFPaser.seg_en
|
| 145 |
+
# section_pair["texts"] = seg.segment(section_pair["texts"])
|
| 146 |
+
# section_pair["texts"] = [
|
| 147 |
+
# para for para in section_pair["text"] if len(para) > 2]
|
| 148 |
+
# return section_pair_list
|
requirements.txt
CHANGED
|
@@ -6,4 +6,5 @@ sentence_transformers
|
|
| 6 |
bs4
|
| 7 |
openai
|
| 8 |
matplotlib
|
| 9 |
-
plotly
|
|
|
|
|
|
| 6 |
bs4
|
| 7 |
openai
|
| 8 |
matplotlib
|
| 9 |
+
plotly
|
| 10 |
+
pysbd
|