AzizWazir committed on
Commit
a4fb3b6
·
verified ·
1 Parent(s): 7f705e7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -0
app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app: OCR an uploaded PDF and search its text through Pinecone.

Pipeline, executed top to bottom on every Streamlit rerun:

1. Save the uploaded PDF into a per-session temporary directory.
2. Rasterise the pages with pdf2image (needs Poppler) and OCR them with
   pytesseract.
3. Offer the extracted text as a download.
4. Embed each non-blank line with a SentenceTransformer model and upsert the
   vectors into a Pinecone index.
5. Answer free-text queries with a top-5 similarity search.

Configuration (all optional, defaults match the previous hard-coded values):

* ``POPPLER_PATH``         - directory holding the Poppler binaries.
* ``PINECONE_API_KEY``     - Pinecone API key.
* ``PINECONE_ENVIRONMENT`` - Pinecone environment name.
"""
import os
import shutil
import tempfile
from pathlib import Path

import pinecone
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from sentence_transformers import SentenceTransformer

# Directory with the Poppler binaries used by pdf2image.
POPPLER_BIN = os.environ.get(
    "POPPLER_PATH", "C:\\Program Files\\poppler-24.08.0\\bin"
)
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
# Must match the index dimension passed to create_index below.
EMBEDDING_DIMENSION = 384
INDEX_NAME = "pdf-text-index"
# Number of vectors sent per upsert request.
UPSERT_BATCH_SIZE = 100

# Streamlit re-executes this script in the same process on every interaction,
# so only extend PATH when the directory exists and is not already listed;
# an unconditional `+=` would grow PATH on each rerun.
_path_entries = os.environ.get("PATH", "").split(os.pathsep)
if os.path.isdir(POPPLER_BIN) and POPPLER_BIN not in _path_entries:
    os.environ["PATH"] = os.pathsep.join(
        [entry for entry in _path_entries if entry] + [POPPLER_BIN]
    )


@st.cache_resource
def _load_model():
    """Load the sentence-embedding model once per server process."""
    return SentenceTransformer(EMBEDDING_MODEL_NAME)


# Initialize Streamlit app
st.title("PDF Image to Text/Word Converter with Search Capability")
st.write("Upload your PDF to extract text or Word document and search content within it.")

# Keep ONE temporary directory per browser session. Calling mkdtemp() at
# module level would create (and leak) a new directory on every rerun.
if not os.path.isdir(st.session_state.get("temp_dir", "")):
    st.session_state["temp_dir"] = tempfile.mkdtemp()
temp_dir = st.session_state["temp_dir"]

# Upload PDF file
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file:
    # Use only the basename: the name is client-supplied and must not be able
    # to point outside temp_dir.
    pdf_path = Path(temp_dir) / Path(uploaded_file.name).name
    with open(pdf_path, "wb") as f:
        # getvalue() returns the full content regardless of the current
        # stream position, unlike read().
        f.write(uploaded_file.getvalue())

    st.write("File uploaded successfully!")

    # Convert PDF pages to images
    st.write("Converting PDF to images...")
    images = convert_from_path(pdf_path, output_folder=temp_dir)

    # Extract text from images
    st.write("Extracting text from images...")
    page_texts = []
    for idx, image in enumerate(images):
        st.image(image, caption=f"Page {idx + 1}", use_column_width=True)
        page_texts.append(pytesseract.image_to_string(image))
    # One trailing newline per page, assembled in a single pass.
    extracted_text = "".join(f"{page_text}\n" for page_text in page_texts)

    # Save extracted text to a .txt file
    text_file_path = Path(temp_dir) / "extracted_text.txt"
    with open(text_file_path, "w", encoding="utf-8") as text_file:
        text_file.write(extracted_text)

    st.success("Text extraction complete!")

    # Option to download text file
    st.download_button(
        label="Download Extracted Text",
        data=extracted_text,
        file_name="extracted_text.txt",
        mime="text/plain",
    )

    # Initialize vector model and Pinecone
    st.write("Initializing vector search...")
    model = _load_model()
    # Credentials come from the environment so the key is not committed to
    # source control; the fallbacks are the original placeholder values.
    # NOTE(review): pinecone.init / pinecone.list_indexes / pinecone.Index look
    # like the pinecone-client 2.x module-level API - confirm the pinned
    # client version before deploying.
    pinecone.init(
        api_key=os.environ.get("PINECONE_API_KEY", "YOUR_PINECONE_API_KEY"),
        environment=os.environ.get("PINECONE_ENVIRONMENT", "us-west1-gcp"),
    )

    # Create Pinecone index if it doesn't exist
    if INDEX_NAME not in pinecone.list_indexes():
        pinecone.create_index(INDEX_NAME, dimension=EMBEDDING_DIMENSION)
    index = pinecone.Index(INDEX_NAME)

    # Generate vector embeddings and upload to Pinecone, but only when the
    # extracted text differs from what this session already indexed; otherwise
    # every keystroke in the query box would re-embed and re-upload everything.
    if st.session_state.get("indexed_text") != extracted_text:
        st.write("Generating vector embeddings...")
        # Blank OCR lines carry no content, so they are not indexed.
        sentences = [line for line in extracted_text.split("\n") if line.strip()]
        if sentences:
            embeddings = model.encode(sentences)
            # .tolist() turns each embedding row into a plain list of floats
            # for the request payload.
            vectors = [
                (f"sentence-{i}", embedding.tolist(), {"sentence": sentence})
                for i, (sentence, embedding) in enumerate(zip(sentences, embeddings))
            ]
            # NOTE(review): ids are positional, so vectors from a previously
            # uploaded, longer document stay in the index - consider a
            # per-document namespace.
            for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
                index.upsert(vectors[start:start + UPSERT_BATCH_SIZE])
        else:
            st.warning("No text could be extracted from this PDF; nothing was indexed.")
        st.session_state["indexed_text"] = extracted_text

    # Search functionality
    st.write("Search within the extracted text")
    query = st.text_input("Enter your query:")
    if query:
        query_vector = model.encode([query])[0].tolist()
        results = index.query(vector=query_vector, top_k=5, include_metadata=True)
        st.write("Top results:")
        for match in results["matches"]:
            st.write(f"Score: {match['score']}, Text: {match['metadata']['sentence']}")

# Cleanup temporary files
if st.button("Clean Temporary Files"):
    # ignore_errors: the directory may already be gone.
    shutil.rmtree(temp_dir, ignore_errors=True)
    # Drop the stored path so the next rerun creates a fresh directory.
    st.session_state.pop("temp_dir", None)
    st.success("Temporary files cleaned!")