AzizWazir committed on
Commit
7f705e7
·
verified ·
1 Parent(s): e17d9e8

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -86
app.py DELETED
@@ -1,86 +0,0 @@
"""Streamlit app: OCR an uploaded PDF and search its text through Pinecone.

Flow: upload a PDF -> render its pages to images -> OCR every page with
Tesseract -> offer the text as a download -> embed the non-blank lines with a
SentenceTransformer model -> upsert them into a Pinecone index -> answer
free-text queries against that index.

Configuration is read from the environment:
    PINECONE_API_KEY      required for the search section
    PINECONE_ENVIRONMENT  optional, defaults to "us-west1-gcp"
    POPPLER_PATH          optional directory holding the poppler binaries;
                          when unset pdf2image looks them up on PATH
"""

import os
import shutil
import tempfile
from pathlib import Path

import pinecone
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from sentence_transformers import SentenceTransformer

INDEX_NAME = "pdf-text-index"
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
EMBEDDING_DIMENSION = 384  # dimension the index is created with
UPSERT_BATCH_SIZE = 100  # vectors sent per upsert request

# Initialize Streamlit app
st.title("PDF Image to Text/Word Converter with Search Capability")
st.write("Upload your PDF to extract text or Word document and search content within it.")

# Streamlit re-executes this script on every interaction, so the working
# directory is kept in session state: creating it unconditionally would leave
# a new directory behind on each rerun, and the cleanup button below would
# remove a fresh empty one instead of the directory that holds the files.
if "temp_dir" not in st.session_state or not Path(st.session_state["temp_dir"]).is_dir():
    st.session_state["temp_dir"] = tempfile.mkdtemp()
temp_dir = st.session_state["temp_dir"]

# Upload PDF file
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file:
    # Keep only the final path component of the client-supplied name so the
    # file always lands inside temp_dir.
    pdf_path = Path(temp_dir) / Path(uploaded_file.name).name
    # getvalue() returns the whole buffer regardless of the stream position,
    # so a rerun does not write an empty file the way a second read() would.
    pdf_path.write_bytes(uploaded_file.getvalue())

    st.write("File uploaded successfully!")

    # Convert PDF pages to images
    st.write("Converting PDF to images...")
    images = convert_from_path(
        pdf_path,
        output_folder=temp_dir,
        # None lets pdf2image find the poppler binaries on PATH.
        poppler_path=os.environ.get("POPPLER_PATH"),
    )

    # Extract text from images
    st.write("Extracting text from images...")
    page_texts = []
    for page_number, image in enumerate(images, start=1):
        st.image(image, caption=f"Page {page_number}", use_column_width=True)
        page_texts.append(pytesseract.image_to_string(image))
    # Each page's text is followed by a newline.
    extracted_text = "".join(f"{page_text}\n" for page_text in page_texts)

    # Save extracted text to a .txt file
    text_file_path = Path(temp_dir) / "extracted_text.txt"
    text_file_path.write_text(extracted_text, encoding="utf-8")

    st.success("Text extraction complete!")

    # Option to download text file
    st.download_button(
        label="Download Extracted Text",
        data=extracted_text,
        file_name="extracted_text.txt",
        mime="text/plain",
    )

    # Initialize vector model and Pinecone
    api_key = os.environ.get("PINECONE_API_KEY")
    if not api_key:
        # Without credentials the search section cannot work; say so instead
        # of failing inside the Pinecone client.
        st.error("Set the PINECONE_API_KEY environment variable to enable search.")
    else:
        st.write("Initializing vector search...")
        model = SentenceTransformer(EMBEDDING_MODEL)
        pinecone.init(
            api_key=api_key,
            environment=os.environ.get("PINECONE_ENVIRONMENT", "us-west1-gcp"),
        )

        # Create Pinecone index if it doesn't exist
        if INDEX_NAME not in pinecone.list_indexes():
            pinecone.create_index(INDEX_NAME, dimension=EMBEDDING_DIMENSION)
        index = pinecone.Index(INDEX_NAME)

        # Generate vector embeddings and upload to Pinecone
        st.write("Generating vector embeddings...")
        # Blank lines carry no content; skip them rather than indexing
        # empty strings.
        sentences = [line for line in extracted_text.split("\n") if line.strip()]
        if sentences:
            embeddings = model.encode(sentences)
            vectors = [
                # Plain lists of floats serialize regardless of how the client
                # treats numpy arrays.
                (f"sentence-{i}", embedding.tolist(), {"sentence": sentence})
                for i, (sentence, embedding) in enumerate(zip(sentences, embeddings))
            ]
            # Send vectors in batches instead of one request per line.
            for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
                index.upsert(vectors=vectors[start:start + UPSERT_BATCH_SIZE])
        else:
            st.warning("No text was recognized in the uploaded PDF.")

        # Search functionality
        st.write("Search within the extracted text")
        query = st.text_input("Enter your query:")
        if query:
            query_vector = model.encode([query])[0]
            results = index.query(
                vector=query_vector.tolist(),
                top_k=5,
                include_metadata=True,
            )
            st.write("Top results:")
            for match in results["matches"]:
                st.write(f"Score: {match['score']}, Text: {match['metadata']['sentence']}")

# Cleanup temporary files
if st.button("Clean Temporary Files"):
    # The directory is recreated at the top of the next rerun.
    shutil.rmtree(temp_dir)
    st.success("Temporary files cleaned!")