Update app.py
Browse files
app.py
CHANGED
|
@@ -31,11 +31,16 @@ nltk.download('punkt')
|
|
| 31 |
nltk.download('punkt_tab')
|
| 32 |
nltk.download('stopwords')
|
| 33 |
|
| 34 |
-
# Create directories if they don't exist
|
| 35 |
def create_dirs_if_needed():
|
| 36 |
"""Create the necessary directories if they don't exist."""
|
| 37 |
-
os.
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
# Call the function at the start of your app
|
| 41 |
create_dirs_if_needed()
|
|
@@ -509,35 +514,46 @@ def is_query_relevant(question, source_documents, threshold=0.1):
|
|
| 509 |
except Exception as e:
|
| 510 |
logger.error(f"Error checking query relevance: {str(e)}", exc_info=True)
|
| 511 |
return False
|
|
|
|
| 512 |
def get_pdf_details(filename, page_number):
|
| 513 |
"""Get details of a specific PDF page."""
|
| 514 |
logger.info(f"Processing PDF details for file: {filename}, page: {page_number}")
|
| 515 |
try:
|
| 516 |
-
#
|
| 517 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 518 |
file_path = os.path.join(data_path, filename)
|
| 519 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
# Open the PDF
|
| 521 |
logger.debug(f"Opening PDF file: {file_path}")
|
| 522 |
doc = fitz.open(file_path)
|
| 523 |
-
|
| 524 |
# Extract full PDF text
|
| 525 |
full_text = ""
|
| 526 |
for page in doc:
|
| 527 |
full_text += page.get_text()
|
| 528 |
-
|
| 529 |
# Get PDF metadata
|
| 530 |
pdf_metadata = doc.metadata or {}
|
| 531 |
-
|
| 532 |
# Extract page text and render page image
|
| 533 |
page = doc.load_page(page_number)
|
| 534 |
page_text = page.get_text()
|
| 535 |
-
|
| 536 |
# Render page as image
|
| 537 |
pix = page.get_pixmap()
|
| 538 |
img_bytes = pix.tobytes("png")
|
| 539 |
page_image_base64 = base64.b64encode(img_bytes).decode('utf-8')
|
| 540 |
-
|
| 541 |
# Detect language
|
| 542 |
try:
|
| 543 |
lang_code = detect(page_text)
|
|
@@ -545,7 +561,7 @@ def get_pdf_details(filename, page_number):
|
|
| 545 |
except Exception as e:
|
| 546 |
logger.warning(f"Language detection failed: {str(e)}")
|
| 547 |
language = 'Unknown'
|
| 548 |
-
|
| 549 |
# Prepare response
|
| 550 |
return {
|
| 551 |
"file_path": file_path,
|
|
@@ -574,10 +590,20 @@ def get_romanized_text(filename):
|
|
| 574 |
"""Get romanized text from a PDF."""
|
| 575 |
logger.info(f"Processing romanized text for file: {filename}")
|
| 576 |
try:
|
| 577 |
-
#
|
| 578 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 579 |
file_path = os.path.join(data_path, filename)
|
| 580 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 581 |
# Open the PDF
|
| 582 |
logger.debug(f"Opening PDF file for romanization: {file_path}")
|
| 583 |
doc = fitz.open(file_path)
|
|
|
|
| 31 |
nltk.download('punkt_tab')
|
| 32 |
nltk.download('stopwords')
|
| 33 |
|
|
|
|
| 34 |
def create_dirs_if_needed():
    """Create the storage directories used by the app if they don't exist.

    When a ``/tmp`` path exists, ``/tmp/data`` and ``/tmp/db`` are created;
    otherwise ``data`` and ``db`` are created relative to the current
    working directory. Directories that already exist are left untouched.

    Returns:
        None.

    Raises:
        OSError: Propagated from ``os.makedirs`` if a directory cannot be
            created.
    """
    # The presence of /tmp is used as the signal for running in a Hugging
    # Face Space. get_pdf_details and get_romanized_text select their
    # data_path with the same check, so the condition and the literal paths
    # here must stay in sync with them.
    # NOTE(review): /tmp also exists on most Unix-like machines, so this
    # heuristic likely picks the /tmp branch locally as well - confirm that
    # is intended.
    if os.path.exists('/tmp'):
        targets = ('/tmp/data', '/tmp/db')
    else:
        # Local environment: keep everything next to the application.
        targets = ('data', 'db')

    for path in targets:
        # exist_ok=True makes repeated calls (e.g. on every app rerun) safe.
        os.makedirs(path, exist_ok=True)
|
| 44 |
|
| 45 |
# Call the function at the start of your app
|
| 46 |
create_dirs_if_needed()
|
|
|
|
| 514 |
except Exception as e:
|
| 515 |
logger.error(f"Error checking query relevance: {str(e)}", exc_info=True)
|
| 516 |
return False
|
| 517 |
+
|
| 518 |
def get_pdf_details(filename, page_number):
|
| 519 |
"""Get details of a specific PDF page."""
|
| 520 |
logger.info(f"Processing PDF details for file: {filename}, page: {page_number}")
|
| 521 |
try:
|
| 522 |
+
# Check if running in Hugging Face space or locally
|
| 523 |
+
if os.path.exists('/tmp'):
|
| 524 |
+
data_path = '/tmp/data' # Hugging Face temporary storage
|
| 525 |
+
else:
|
| 526 |
+
data_path = 'data' # Local storage
|
| 527 |
+
|
| 528 |
file_path = os.path.join(data_path, filename)
|
| 529 |
|
| 530 |
+
# Ensure file exists
|
| 531 |
+
if not os.path.exists(file_path):
|
| 532 |
+
logger.error(f"File does not exist at {file_path}")
|
| 533 |
+
st.error(f"File not found at {file_path}")
|
| 534 |
+
return
|
| 535 |
+
|
| 536 |
# Open the PDF
|
| 537 |
logger.debug(f"Opening PDF file: {file_path}")
|
| 538 |
doc = fitz.open(file_path)
|
| 539 |
+
|
| 540 |
# Extract full PDF text
|
| 541 |
full_text = ""
|
| 542 |
for page in doc:
|
| 543 |
full_text += page.get_text()
|
| 544 |
+
|
| 545 |
# Get PDF metadata
|
| 546 |
pdf_metadata = doc.metadata or {}
|
| 547 |
+
|
| 548 |
# Extract page text and render page image
|
| 549 |
page = doc.load_page(page_number)
|
| 550 |
page_text = page.get_text()
|
| 551 |
+
|
| 552 |
# Render page as image
|
| 553 |
pix = page.get_pixmap()
|
| 554 |
img_bytes = pix.tobytes("png")
|
| 555 |
page_image_base64 = base64.b64encode(img_bytes).decode('utf-8')
|
| 556 |
+
|
| 557 |
# Detect language
|
| 558 |
try:
|
| 559 |
lang_code = detect(page_text)
|
|
|
|
| 561 |
except Exception as e:
|
| 562 |
logger.warning(f"Language detection failed: {str(e)}")
|
| 563 |
language = 'Unknown'
|
| 564 |
+
|
| 565 |
# Prepare response
|
| 566 |
return {
|
| 567 |
"file_path": file_path,
|
|
|
|
| 590 |
"""Get romanized text from a PDF."""
|
| 591 |
logger.info(f"Processing romanized text for file: {filename}")
|
| 592 |
try:
|
| 593 |
+
# Check if running in Hugging Face space or locally
|
| 594 |
+
if os.path.exists('/tmp'):
|
| 595 |
+
data_path = '/tmp/data' # Use Hugging Face's temp directory
|
| 596 |
+
else:
|
| 597 |
+
data_path = 'data' # Use local directory
|
| 598 |
+
|
| 599 |
file_path = os.path.join(data_path, filename)
|
| 600 |
|
| 601 |
+
# Ensure file exists
|
| 602 |
+
if not os.path.exists(file_path):
|
| 603 |
+
logger.error(f"File does not exist at {file_path}")
|
| 604 |
+
st.error(f"File not found at {file_path}")
|
| 605 |
+
return
|
| 606 |
+
|
| 607 |
# Open the PDF
|
| 608 |
logger.debug(f"Opening PDF file for romanization: {file_path}")
|
| 609 |
doc = fitz.open(file_path)
|