Spaces:
Sleeping
Sleeping
T-K-O-H
committed on
Commit
·
3076d04
1
Parent(s):
efde092
Update requirements.txts
Browse files
app.py
CHANGED
|
@@ -105,35 +105,39 @@ def extract_text_from_pdf(pdf_file):
|
|
| 105 |
# Sidebar for document upload
|
| 106 |
with st.sidebar:
|
| 107 |
st.header("Document Management")
|
| 108 |
-
uploaded_file = st.file_uploader("Upload a document", type=["txt", "pdf"])
|
| 109 |
if uploaded_file:
|
| 110 |
try:
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
text = extract_text_from_pdf(uploaded_file)
|
| 115 |
else:
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
except Exception as e:
|
| 138 |
logger.error(f"Error processing document: {str(e)}")
|
| 139 |
st.error(f"Error processing document: {str(e)}")
|
|
|
|
| 105 |
# Sidebar for document upload
|
| 106 |
with st.sidebar:
|
| 107 |
st.header("Document Management")
|
| 108 |
+
uploaded_file = st.file_uploader("Upload a document (max 10MB)", type=["txt", "pdf"])
|
| 109 |
if uploaded_file:
|
| 110 |
try:
|
| 111 |
+
# Check file size (10MB = 10 * 1024 * 1024 bytes)
|
| 112 |
+
if uploaded_file.size > 10 * 1024 * 1024:
|
| 113 |
+
st.error("File size exceeds 10MB limit. Please upload a smaller file.")
|
|
|
|
| 114 |
else:
|
| 115 |
+
logger.info(f"Processing uploaded file: {uploaded_file.name}")
|
| 116 |
+
# Process the document based on file type
|
| 117 |
+
if uploaded_file.type == "application/pdf":
|
| 118 |
+
text = extract_text_from_pdf(uploaded_file)
|
| 119 |
+
else:
|
| 120 |
+
# For text files, detect encoding
|
| 121 |
+
raw_data = uploaded_file.getvalue()
|
| 122 |
+
result = chardet.detect(raw_data)
|
| 123 |
+
encoding = result['encoding']
|
| 124 |
+
text = raw_data.decode(encoding)
|
| 125 |
+
|
| 126 |
+
if not text.strip():
|
| 127 |
+
raise ValueError("No text content found in the document")
|
| 128 |
+
|
| 129 |
+
# Process text into semantic chunks
|
| 130 |
+
chunks = process_text(text)
|
| 131 |
+
|
| 132 |
+
if not chunks:
|
| 133 |
+
raise ValueError("No valid text chunks could be created from the document")
|
| 134 |
+
|
| 135 |
+
# Add to vectorstore
|
| 136 |
+
logger.info(f"Adding {len(chunks)} chunks to vectorstore")
|
| 137 |
+
vectorstore.add_texts(chunks)
|
| 138 |
+
|
| 139 |
+
st.success("Document processed and added to the knowledge base!")
|
| 140 |
+
st.info(f"Processed {len(chunks)} chunks of text")
|
| 141 |
except Exception as e:
|
| 142 |
logger.error(f"Error processing document: {str(e)}")
|
| 143 |
st.error(f"Error processing document: {str(e)}")
|