omkar-surve126 committed on
Commit
109676f
·
verified ·
1 Parent(s): 568477c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +157 -0
app.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from mistralai import Mistral
4
+ import os
5
+ import io
6
+ import zipfile
7
+
8
# --- Page setup: must run before any other Streamlit element is drawn ---
st.set_page_config(page_title="Mistral OCR Processor", layout="wide")

st.title("📄 OCR Extraction")
st.markdown("""
This tool allows you to upload multiple PDF files, process them using **Mistral's OCR**.
**Output:**
1. A ZIP file containing full text files for each PDF.
2. A CSV with every sentence having its metadata.
""")

# --- Sidebar: API credentials and usage notes ---
# Object notation (st.sidebar.<element>) places each widget in the sidebar,
# exactly like a `with st.sidebar:` block would.
sidebar = st.sidebar
sidebar.header("Configuration")
# `api_key` is read by the main interface below; it stays empty until the
# user types a key, and the input is masked because type="password".
api_key = sidebar.text_input("Enter Mistral API Key", type="password")
sidebar.caption("Your key is not stored and is only used for this session.")

sidebar.divider()
sidebar.info("Ensure you have access to `mistral-ocr-latest`.")
28
+ # --- Helper Function: Process Single PDF ---
29
def process_pdf(client, uploaded_file):
    """Run Mistral OCR on one uploaded PDF and return its text.

    The file is uploaded to Mistral, a signed URL is requested for it, the
    OCR model is run against that URL, and the markdown of every page is
    concatenated. The temporary upload is deleted afterwards so user
    documents are not left behind in the account's file storage.

    Args:
        client: A Mistral client exposing ``files.upload``,
            ``files.get_signed_url``, ``files.delete`` and ``ocr.process``.
        uploaded_file: The uploaded file object; must provide ``.name`` and
            ``.getvalue()`` (raw bytes).

    Returns:
        The markdown of all pages, each followed by a single space, or
        ``None`` if any step failed (the error is shown in the UI).
    """
    remote_file = None
    try:
        # 1. Upload the file to Mistral.
        # .getvalue() hands over raw bytes, which satisfies the SDK's
        # strict (Pydantic) typing of the "content" field.
        remote_file = client.files.upload(
            file={
                "file_name": uploaded_file.name,
                "content": uploaded_file.getvalue(),
            },
            purpose="ocr",
        )

        # 2. Get a temporary signed URL for the uploaded file.
        signed_url = client.files.get_signed_url(file_id=remote_file.id)

        # 3. Process with Mistral OCR.
        ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            },
            include_image_base64=False,
        )

        # 4. Extract text. A response without pages yields an empty string.
        # The trailing space per page keeps the last sentence of one page
        # from merging with the first sentence of the next.
        pages = getattr(ocr_response, "pages", None) or []
        return "".join(page.markdown + " " for page in pages)

    except Exception as e:
        # UI boundary: report the failure and let the caller skip this file.
        st.error(f"Error processing {uploaded_file.name}: {e}")
        return None

    finally:
        # Best-effort cleanup of the temporary upload. A failed deletion
        # must not discard an OCR result that was already obtained, so it
        # is surfaced as a warning instead of being raised.
        if remote_file is not None:
            try:
                client.files.delete(file_id=remote_file.id)
            except Exception as cleanup_error:
                st.warning(
                    f"Could not delete temporary upload for "
                    f"{uploaded_file.name}: {cleanup_error}"
                )
68
+
69
+ # --- Main Interface ---
70
+
71
# --- Main Interface ---
# Flow: collect PDFs -> on button click run OCR for each -> keep the outputs
# in st.session_state -> render the download buttons from session state.
uploaded_files = st.file_uploader(
    "Upload PDF files",
    type=["pdf"],
    accept_multiple_files=True,
)

if st.button("Start OCR Processing", type="primary"):
    if not api_key:
        st.error("Please provide your Mistral API Key in the sidebar.")
    elif not uploaded_files:
        st.warning("Please upload at least one PDF file.")
    else:
        # Initialize Mistral Client
        client = Mistral(api_key=api_key)

        # Containers for results
        processed_data = []        # One dict per sentence, for the CSV
        zip_buffer = io.BytesIO()  # In-memory ZIP of the full-text files
        used_names = set()         # ZIP entry names already written
        succeeded = 0              # Files that actually yielded text
        total = len(uploaded_files)

        # Progress feedback
        progress_bar = st.progress(0)
        status_text = st.empty()

        with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zf:
            for idx, file in enumerate(uploaded_files, start=1):
                status_text.text(f"Processing file {idx} of {total}: {file.name}...")

                # Perform OCR; None or "" means this file produced nothing.
                extracted_text = process_pdf(client, file)

                if extracted_text:
                    succeeded += 1

                    # --- CSV: one row per sentence ---
                    # Sentences are split on the full stop only, so text
                    # such as "3.14" is split as well (original behaviour).
                    fragments = (part.strip() for part in extracted_text.split('.'))
                    processed_data.extend(
                        {"text": fragment, "source_file": file.name}
                        for fragment in fragments
                        if fragment
                    )

                    # --- ZIP: keep the full text ---
                    # Two uploads can share a stem (e.g. "a.pdf" twice);
                    # add a numeric suffix so entry names stay unique.
                    stem = os.path.splitext(file.name)[0]
                    txt_filename = f"{stem}.txt"
                    duplicate_no = 1
                    while txt_filename in used_names:
                        duplicate_no += 1
                        txt_filename = f"{stem}_{duplicate_no}.txt"
                    used_names.add(txt_filename)
                    zf.writestr(txt_filename, extracted_text)

                # Update progress
                progress_bar.progress(idx / total)

        status_text.text("Processing Complete!")

        # Persist the outputs. st.button is True only for the run triggered
        # by its click, and a download-button click reruns the script, so
        # results rendered inside this branch would vanish after the first
        # download.
        if processed_data:
            df = pd.DataFrame(processed_data)
            st.session_state["ocr_results"] = {
                "df": df,
                "csv": df.to_csv(index=False).encode("utf-8"),
                # Read only after the ZipFile is closed, so the archive's
                # central directory has been written.
                "zip": zip_buffer.getvalue(),
                "succeeded": succeeded,
                "total": total,
            }
        elif "ocr_results" in st.session_state:
            # Do not keep showing results from an earlier run.
            del st.session_state["ocr_results"]

# --- Display Results & Downloads ---
# Rendered on every rerun while results exist in the session.
if "ocr_results" in st.session_state:
    results = st.session_state["ocr_results"]

    # Report the files that really produced text, not every upload.
    st.success(
        f"Successfully processed {results['succeeded']} of {results['total']} files."
    )

    # Layout for download buttons
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("1. Download Sentence-wise CSV")
        st.download_button(
            label="Download CSV (Sentences)",
            data=results["csv"],
            file_name="ocr_sentences.csv",
            mime="text/csv",
        )

    with col2:
        st.subheader("2. Download Full Text Files")
        st.download_button(
            label="Download All .txt Files (ZIP)",
            data=results["zip"],
            file_name="individual_ocr_files.zip",
            mime="application/zip",
        )

    # Preview Data
    with st.expander("Preview Extracted Sentences"):
        st.dataframe(results["df"])