Spaces:
Sleeping
Sleeping
Commit
·
8fefe0f
1
Parent(s):
31cf81a
fixed more problems with the file uploading + processing
Browse files
- document_processor.py +25 -2
document_processor.py
CHANGED
|
@@ -9,6 +9,7 @@ from llama_index.core.retrievers import VectorIndexRetriever
|
|
| 9 |
from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
|
| 10 |
from llama_index.core.prompts import PromptTemplate
|
| 11 |
from config import *
|
|
|
|
| 12 |
|
| 13 |
def log_message(message):
|
| 14 |
print(message, flush=True)
|
|
@@ -41,6 +42,16 @@ def process_uploaded_file(file_path, file_name, doc_name, doc_link):
|
|
| 41 |
try:
|
| 42 |
log_message(f"π Processing file: {file_name}")
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
file_extension = Path(file_path).suffix.lower()
|
| 45 |
|
| 46 |
if file_extension == '.pdf':
|
|
@@ -72,12 +83,24 @@ def process_uploaded_file(file_path, file_name, doc_name, doc_link):
|
|
| 72 |
|
| 73 |
def get_existing_documents():
|
| 74 |
try:
|
|
|
|
| 75 |
chunks_csv_path = os.path.join(download_dir, chunks_filename)
|
| 76 |
if os.path.exists(chunks_csv_path):
|
| 77 |
chunks_df = pd.read_csv(chunks_csv_path)
|
| 78 |
-
if not chunks_df.empty:
|
| 79 |
unique_docs = chunks_df['document_name'].unique()
|
| 80 |
-
return sorted(unique_docs.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
return []
|
| 82 |
except Exception as e:
|
| 83 |
log_message(f"β Error reading documents: {str(e)}")
|
|
|
|
| 9 |
from llama_index.core.response_synthesizers import get_response_synthesizer, ResponseMode
|
| 10 |
from llama_index.core.prompts import PromptTemplate
|
| 11 |
from config import *
|
| 12 |
+
import shutil
|
| 13 |
|
| 14 |
def log_message(message):
|
| 15 |
print(message, flush=True)
|
|
|
|
| 42 |
try:
|
| 43 |
log_message(f"π Processing file: {file_name}")
|
| 44 |
|
| 45 |
+
# Create upload directory if it doesn't exist
|
| 46 |
+
upload_dir = "UPLOADED_DOCUMENTS"
|
| 47 |
+
os.makedirs(upload_dir, exist_ok=True)
|
| 48 |
+
|
| 49 |
+
# Copy uploaded file to permanent location
|
| 50 |
+
permanent_file_path = os.path.join(upload_dir, file_name)
|
| 51 |
+
if os.path.abspath(file_path) != os.path.abspath(permanent_file_path):
|
| 52 |
+
shutil.copy2(file_path, permanent_file_path)
|
| 53 |
+
log_message(f"π File saved to: {permanent_file_path}")
|
| 54 |
+
|
| 55 |
file_extension = Path(file_path).suffix.lower()
|
| 56 |
|
| 57 |
if file_extension == '.pdf':
|
|
|
|
| 83 |
|
| 84 |
def get_existing_documents():
|
| 85 |
try:
|
| 86 |
+
# First check CSV file for processed documents
|
| 87 |
chunks_csv_path = os.path.join(download_dir, chunks_filename)
|
| 88 |
if os.path.exists(chunks_csv_path):
|
| 89 |
chunks_df = pd.read_csv(chunks_csv_path)
|
| 90 |
+
if not chunks_df.empty and 'document_name' in chunks_df.columns:
|
| 91 |
unique_docs = chunks_df['document_name'].unique()
|
| 92 |
+
return sorted([doc for doc in unique_docs if pd.notna(doc)])
|
| 93 |
+
|
| 94 |
+
# Fallback to checking uploaded files directory
|
| 95 |
+
upload_dir = "UPLOADED_DOCUMENTS"
|
| 96 |
+
if os.path.exists(upload_dir):
|
| 97 |
+
documents = []
|
| 98 |
+
for file_name in os.listdir(upload_dir):
|
| 99 |
+
if file_name.endswith(('.txt', '.pdf')):
|
| 100 |
+
doc_name = os.path.splitext(file_name)[0]
|
| 101 |
+
documents.append(doc_name)
|
| 102 |
+
return sorted(documents)
|
| 103 |
+
|
| 104 |
return []
|
| 105 |
except Exception as e:
|
| 106 |
log_message(f"β Error reading documents: {str(e)}")
|