Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,14 +6,6 @@ from pathlib import Path
|
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
import pinecone
|
| 8 |
import tempfile
|
| 9 |
-
import shuimport streamlit as st
|
| 10 |
-
from pdf2image import convert_from_path
|
| 11 |
-
import pytesseract
|
| 12 |
-
import os
|
| 13 |
-
from pathlib import Path
|
| 14 |
-
from sentence_transformers import SentenceTransformer
|
| 15 |
-
import pinecone
|
| 16 |
-
import tempfile
|
| 17 |
import shutil
|
| 18 |
|
| 19 |
# Debug: Check PATH and pdfinfo accessibility
|
|
@@ -103,97 +95,3 @@ if uploaded_file:
|
|
| 103 |
if st.button("Clean Temporary Files"):
|
| 104 |
shutil.rmtree(temp_dir)
|
| 105 |
st.success("Temporary files cleaned!")
|
| 106 |
-
til
|
| 107 |
-
|
| 108 |
-
# Debug: Display current PATH
|
| 109 |
-
st.write("System PATH:", os.environ["PATH"])
|
| 110 |
-
|
| 111 |
-
# Dynamically add Poppler to PATH if necessary
|
| 112 |
-
poppler_path = "C:\\Program Files\\poppler-24.08.0\\bin"
|
| 113 |
-
if poppler_path not in os.environ["PATH"]:
|
| 114 |
-
os.environ["PATH"] += os.pathsep + poppler_path
|
| 115 |
-
|
| 116 |
-
# Debug: Confirm Poppler is accessible
|
| 117 |
-
if shutil.which("pdfinfo") is None:
|
| 118 |
-
st.error("Poppler's pdfinfo executable not found in PATH!")
|
| 119 |
-
|
| 120 |
-
# Initialize Streamlit app
|
| 121 |
-
st.title("PDF Image to Text/Word Converter with Search Capability")
|
| 122 |
-
st.write("Upload your PDF to extract text or Word document and search content within it.")
|
| 123 |
-
|
| 124 |
-
# Create a temporary directory
|
| 125 |
-
temp_dir = tempfile.mkdtemp()
|
| 126 |
-
|
| 127 |
-
# Upload PDF file
|
| 128 |
-
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
|
| 129 |
-
|
| 130 |
-
if uploaded_file:
|
| 131 |
-
pdf_path = Path(temp_dir) / uploaded_file.name
|
| 132 |
-
with open(pdf_path, "wb") as f:
|
| 133 |
-
f.write(uploaded_file.read())
|
| 134 |
-
|
| 135 |
-
st.write("File uploaded successfully!")
|
| 136 |
-
|
| 137 |
-
# Convert PDF pages to images
|
| 138 |
-
st.write("Converting PDF to images...")
|
| 139 |
-
try:
|
| 140 |
-
images = convert_from_path(pdf_path, output_folder=temp_dir)
|
| 141 |
-
except Exception as e:
|
| 142 |
-
st.error(f"Error during PDF to image conversion: {e}")
|
| 143 |
-
st.stop()
|
| 144 |
-
|
| 145 |
-
# Extract text from images
|
| 146 |
-
st.write("Extracting text from images...")
|
| 147 |
-
extracted_text = ""
|
| 148 |
-
for idx, image in enumerate(images):
|
| 149 |
-
st.image(image, caption=f"Page {idx + 1}", use_column_width=True)
|
| 150 |
-
text = pytesseract.image_to_string(image)
|
| 151 |
-
extracted_text += text + "\n"
|
| 152 |
-
|
| 153 |
-
# Save extracted text to a .txt file
|
| 154 |
-
text_file_path = Path(temp_dir) / "extracted_text.txt"
|
| 155 |
-
with open(text_file_path, "w", encoding="utf-8") as text_file:
|
| 156 |
-
text_file.write(extracted_text)
|
| 157 |
-
|
| 158 |
-
st.success("Text extraction complete!")
|
| 159 |
-
|
| 160 |
-
# Option to download text file
|
| 161 |
-
st.download_button(
|
| 162 |
-
label="Download Extracted Text",
|
| 163 |
-
data=extracted_text,
|
| 164 |
-
file_name="extracted_text.txt",
|
| 165 |
-
mime="text/plain",
|
| 166 |
-
)
|
| 167 |
-
|
| 168 |
-
# Initialize vector model and Pinecone
|
| 169 |
-
st.write("Initializing vector search...")
|
| 170 |
-
model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 171 |
-
pinecone.init(api_key="YOUR_PINECONE_API_KEY", environment="us-west1-gcp") # Replace with your Pinecone details
|
| 172 |
-
|
| 173 |
-
# Create Pinecone index if it doesn't exist
|
| 174 |
-
index_name = "pdf-text-index"
|
| 175 |
-
if index_name not in pinecone.list_indexes():
|
| 176 |
-
pinecone.create_index(index_name, dimension=384)
|
| 177 |
-
index = pinecone.Index(index_name)
|
| 178 |
-
|
| 179 |
-
# Generate vector embeddings and upload to Pinecone
|
| 180 |
-
st.write("Generating vector embeddings...")
|
| 181 |
-
sentences = extracted_text.split("\n")
|
| 182 |
-
embeddings = model.encode(sentences)
|
| 183 |
-
for i, embedding in enumerate(embeddings):
|
| 184 |
-
index.upsert([(f"sentence-{i}", embedding, {"sentence": sentences[i]})])
|
| 185 |
-
|
| 186 |
-
# Search functionality
|
| 187 |
-
st.write("Search within the extracted text")
|
| 188 |
-
query = st.text_input("Enter your query:")
|
| 189 |
-
if query:
|
| 190 |
-
query_vector = model.encode([query])[0]
|
| 191 |
-
results = index.query(query_vector, top_k=5, include_metadata=True)
|
| 192 |
-
st.write("Top results:")
|
| 193 |
-
for match in results["matches"]:
|
| 194 |
-
st.write(f"Score: {match['score']}, Text: {match['metadata']['sentence']}")
|
| 195 |
-
|
| 196 |
-
# Cleanup temporary files
|
| 197 |
-
if st.button("Clean Temporary Files"):
|
| 198 |
-
shutil.rmtree(temp_dir)
|
| 199 |
-
st.success("Temporary files cleaned!")
|
|
|
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
import pinecone
|
| 8 |
import tempfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
import shutil
|
| 10 |
|
| 11 |
# Debug: Check PATH and pdfinfo accessibility
|
|
|
|
| 95 |
if st.button("Clean Temporary Files"):
|
| 96 |
shutil.rmtree(temp_dir)
|
| 97 |
st.success("Temporary files cleaned!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|