Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -76,30 +76,9 @@ def run_GOT(pdf_file):
|
|
| 76 |
|
| 77 |
res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
|
| 78 |
|
| 79 |
-
# Read the rendered HTML content
|
| 80 |
-
with open(result_path, 'r') as f:
|
| 81 |
-
html_content = f.read()
|
| 82 |
-
|
| 83 |
-
# Parse the HTML and ensure newlines are preserved
|
| 84 |
-
soup = BeautifulSoup(html_content, 'html.parser')
|
| 85 |
-
|
| 86 |
-
# Extract the text content and ensure newlines are preserved
|
| 87 |
-
text_content = soup.find('div', id='content-text').get_text(separator=' ', strip=True)
|
| 88 |
-
|
| 89 |
-
# Modify the HTML to include newlines in the script
|
| 90 |
-
script_tag = soup.find('script', string=lambda x: 'const text =' in x)
|
| 91 |
-
if script_tag:
|
| 92 |
-
# Replace newlines with escaped newlines in the JavaScript string
|
| 93 |
-
escaped_text = res.replace('\n', ' ')
|
| 94 |
-
script_tag.string = f"const text = \"{escaped_text}\""
|
| 95 |
-
|
| 96 |
-
# Convert the modified BeautifulSoup object back to a string
|
| 97 |
-
formatted_html_content = soup.prettify()
|
| 98 |
-
|
| 99 |
results.append({
|
| 100 |
"page_number": i + 1,
|
| 101 |
-
"text": res
|
| 102 |
-
"html": formatted_html_content
|
| 103 |
})
|
| 104 |
|
| 105 |
if os.path.exists(image_path):
|
|
|
|
| 76 |
|
| 77 |
res = model.chat_crop(tokenizer, image_path, ocr_type='format', render=True, save_render_file=result_path)
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
results.append({
|
| 80 |
"page_number": i + 1,
|
| 81 |
+
"text": res # Directly use the output from model.chat_crop
|
|
|
|
| 82 |
})
|
| 83 |
|
| 84 |
if os.path.exists(image_path):
|