sachin committed on
Commit
187ada4
·
1 Parent(s): af455be
Files changed (3) hide show
  1. app.py +253 -0
  2. requirements.txt +2 -0
  3. sample_resume.pdf +0 -0
app.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import tempfile
4
+ import dwani
5
+ import logging
6
+ import re
7
+
8
# Configure logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Read the dwani API credentials and endpoint from the environment.
dwani.api_key = os.environ.get("DWANI_API_KEY")
dwani.api_base = os.environ.get("DWANI_API_BASE_URL")

# Refuse to start when either setting is missing or empty.
if not (dwani.api_key and dwani.api_base):
    logger.error("API key or base URL not set. Please set DWANI_API_KEY and DWANI_API_BASE_URL environment variables.")
    raise RuntimeError("Please set DWANI_API_KEY and DWANI_API_BASE_URL environment variables.")
19
+
20
# Supported languages as (display name, language code) pairs.
language_options = [
    ("English", "eng_Latn"),
    ("Kannada", "kan_Knda"),
    ("Hindi", "hin_Deva"),
]

# Display names, in declaration order, used as dropdown choices.
language_names = [display_name for display_name, _code in language_options]
# Lookup table from display name to language code.
lang_code_map = dict(language_options)
29
+
30
+
31
def parse_page_numbers(pages_str):
    """
    Parse a string of comma-separated page numbers/ranges into a sorted list of unique integers.

    Example inputs:
    "1,3,5"  -> [1, 3, 5]
    "1-3,5"  -> [1, 2, 3, 5]

    Invalid entries are skipped silently rather than raising: non-numeric
    tokens, pages below 1, and ranges that are malformed, descending, or
    start below 1.

    Args:
        pages_str: The page specification. ``None`` or an empty string
            yields an empty list.

    Returns:
        Ascending list of distinct page numbers (all >= 1).
    """
    # Treat a missing value like an empty one instead of failing on .split().
    if not pages_str:
        return []

    pages = set()
    for part in pages_str.split(","):
        part = part.strip()
        try:
            if "-" in part:
                # Unpacking raises ValueError unless there are exactly two
                # integer bounds, which also rejects "1-2-3" and "-4".
                start, end = map(int, part.split("-"))
                if 1 <= start <= end:
                    pages.update(range(start, end + 1))
            else:
                page = int(part)
                if page >= 1:
                    pages.add(page)
        except ValueError:
            continue
    return sorted(pages)
57
+
58
+
59
def simple_format_resume(text):
    """
    Basic formatting for resume text, producing an HTML fragment:
    - Escape HTML metacharacters (&, <, >) so document text cannot inject markup
    - Convert '** text **' to bold <b>text</b>
    - Replace runs of six or more identical dots, hyphens or underscores with <hr>
    - Preserve line breaks with <br>

    Args:
        text: Plain text to format. ``None`` or an empty string yields "".

    Returns:
        The formatted HTML string.
    """
    import html  # function-scope import; the module does not import html

    if not text:
        return ""

    # Escape first so the tags inserted below are the only markup in the result.
    text = html.escape(text, quote=False)

    # Convert ** bold ** to <b>bold</b>
    text = re.sub(r"\*\*\s*(.*?)\s*\*\*", r"<b>\1</b>", text)

    # Replace long sequences of dots, hyphens or underscores with <hr>
    text = re.sub(r"([.\-_])\1{5,}", "<hr>", text)

    # Normalise line endings, then preserve line breaks
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    return text.replace('\n', '<br>')
77
+
78
+
79
def results_to_html(results):
    """
    Convert the results dictionary into an HTML formatted string,
    applying basic formatting to preserve resume style.

    Values interpolated directly by this function (page heading, error
    message, response text, processed page) are HTML-escaped, because they
    may contain arbitrary text. The original and translated resume bodies
    are formatted by ``simple_format_resume``.

    Args:
        results: Mapping of page label to a per-page dict. A dict with an
            "error" key is rendered as an error paragraph; otherwise the
            keys 'Original Text', 'Response', 'Processed Page' and
            'Translated Response' are read, each defaulting to ''.

    Returns:
        The HTML for all pages joined with newlines, each page followed
        by an <hr>.
    """
    import html  # function-scope import; the module does not import html

    def _esc(value):
        # Escape any value for use as HTML text content.
        return html.escape(str(value), quote=False)

    html_lines = []
    for page, content in results.items():
        html_lines.append(f"<h2>{_esc(page)}</h2>")
        if "error" in content:
            html_lines.append(f"<p style='color:red;'><b>Error:</b> {_esc(content['error'])}</p>")
        else:
            html_lines.append("<h3>Original Text:</h3>")
            original_html = simple_format_resume(content.get('Original Text', ''))
            html_lines.append(f"<div style='background:#f0f0f0; padding:10px; border-radius:5px;'>{original_html}</div>")

            # The response paragraph is omitted entirely when empty.
            response_text = content.get('Response', '')
            if response_text:
                html_lines.append(f"<p><b>Response:</b><br>{_esc(response_text)}</p>")

            html_lines.append(f"<p><b>Processed Page:</b> {_esc(content.get('Processed Page', ''))}</p>")

            translated_html = simple_format_resume(content.get('Translated Response', ''))
            html_lines.append("<h3>Translated Resume:</h3>")
            html_lines.append(f"<div style='background:#e8f5e9; padding:10px; border-radius:5px;'>{translated_html}</div>")

        # Visual separator after every page, error or not.
        html_lines.append("<hr>")
    return "\n".join(html_lines)
106
+
107
+
108
def results_to_markdown(results):
    """
    Render the per-page results dictionary as a single Markdown document.

    Each page becomes a level-2 heading followed by either an error line
    or the original text and translated resume in fenced code blocks,
    with the optional response and the processed page number in between.
    Every page ends with a horizontal rule.
    """
    chunks = []
    for heading, entry in results.items():
        chunks.append(f"## {heading}\n")
        if "error" not in entry:
            # Original text inside a fenced block.
            chunks.extend([
                "**Original Text:**\n\n```",
                entry.get('Original Text', '') + "\n",
                "```\n",
            ])

            # The response section appears only when there is one.
            reply = entry.get('Response', '')
            if reply:
                chunks.append("Response:\n\n" + reply + "\n")

            chunks.append("**Processed Page:** " + str(entry.get('Processed Page', '')) + "\n")

            # Translated resume inside a fenced block.
            chunks.extend([
                "**Translated Resume:**\n\n```",
                entry.get('Translated Response', '') + "\n",
                "```\n",
            ])
        else:
            chunks.append(f"**Error:** {entry['error']}\n")

        chunks.append("\n---\n")
    return "\n".join(chunks)
136
+
137
+
138
def process_pdf(pdf_file, pages_str, prompt, src_lang, tgt_lang):
    """
    Extract and translate selected pages of a PDF through the dwani API.

    Args:
        pdf_file: The uploaded file: either an object exposing the path
            as ``.name`` or the path itself.
        pages_str: Page selection such as "1,3,5" or "1-3".
        prompt: User prompt. It must be non-blank, but it is not forwarded
            to the API because the extract call does not accept it.
        src_lang: Source language display name (a key of ``lang_code_map``).
        tgt_lang: Target language display name (a key of ``lang_code_map``).

    Returns:
        ``(html_text, markdown_path)`` on success, where ``markdown_path``
        is a temporary ``.md`` file holding the same results. On a
        validation failure, ``(error_message, None)``. Per-page API
        failures do not abort the run; they are recorded as error entries
        for that page.
    """
    logger.info(
        "Processing PDF: %s, Pages: %s, Prompt: %s, Source: %s, Target: %s",
        pdf_file, pages_str, prompt, src_lang, tgt_lang,
    )

    if not pdf_file:
        return "Error: Please upload a PDF file", None

    # Treat a missing prompt like a blank one instead of failing on .strip().
    if not (prompt or "").strip():
        return "Error: Please provide a non-empty prompt", None

    # Same for a missing page selection: report it as invalid input.
    pages = parse_page_numbers(pages_str or "")
    if not pages:
        return "Error: Please provide valid page numbers (e.g., 1,3,5 or 1-3)", None

    src_lang_code = lang_code_map.get(src_lang)
    tgt_lang_code = lang_code_map.get(tgt_lang)

    if not src_lang_code or not tgt_lang_code:
        return "Error: Invalid source or target language selection", None

    file_path = getattr(pdf_file, 'name', pdf_file)

    results = {}
    for page_number in pages:
        page_key = f"Page {page_number}"
        try:
            # Call Dwani API (without 'prompt' arg as it's unsupported)
            result = dwani.Documents.run_extract(
                file_path=file_path,
                page_number=page_number,
                src_lang=src_lang_code,
                tgt_lang=tgt_lang_code
            )
            logger.debug("API response for page %s: %s", page_number, result)

            # Pick the entry the API reports for the requested page.
            page_data = next(
                (p for p in result.get('pages', []) if p.get('processed_page') == page_number),
                None,
            )

            if page_data is None:
                results[page_key] = {"error": "No data returned for this page"}
                continue

            results[page_key] = {
                "Processed Page": page_data.get("processed_page", "N/A"),
                "Original Text": page_data.get("page_content", "N/A"),
                "Translated Response": page_data.get("translated_content", "N/A"),
                "Response": ""
            }
        except dwani.exceptions.DwaniAPIError as e:
            logger.error("Dwani API error on page %s: %s", page_number, e)
            results[page_key] = {"error": f"API error: {str(e)}"}
        except Exception as e:
            # Keep the traceback in the log; the remaining pages are still processed.
            logger.exception("Unexpected error on page %s: %s", page_number, e)
            results[page_key] = {"error": f"Unexpected error: {str(e)}"}

    # Convert results to HTML for display
    html_text = results_to_html(results)

    # Save markdown for download. delete=False keeps the file on disk after
    # the handle closes so it can be served by path; the context manager
    # closes the handle even if the write fails.
    markdown_text = results_to_markdown(results)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".md", mode='w', encoding='utf-8') as temp_md_file:
        temp_md_file.write(markdown_text)

    return html_text, temp_md_file.name
204
+
205
+
206
# Gradio UI: inputs in the left column, results in the right column.
with gr.Blocks(title="Resume Translator") as demo:
    gr.Markdown("# Resume Translator")
    gr.Markdown(
        "Upload your resume PDF, specify pages to translate, enter a prompt (e.g., 'Translate the resume'), "
        "and select source and target languages."
    )

    with gr.Row():
        # Input controls.
        with gr.Column():
            pdf_input = gr.File(file_types=[".pdf"], label="Upload Resume PDF")
            pages_input = gr.Textbox(
                value="1",
                lines=1,
                label="Page Numbers",
                placeholder="e.g., 1,3,5 or 1-3",
            )
            prompt_input = gr.Textbox(
                value="Translate the resume",
                lines=2,
                label="Custom Prompt",
                placeholder="e.g., Translate the resume",
            )
            src_lang_input = gr.Dropdown(choices=language_names, value="English", label="Source Language")
            tgt_lang_input = gr.Dropdown(choices=language_names, value="Kannada", label="Target Language")
            submit_btn = gr.Button("Translate")

        # Rendered result and downloadable Markdown copy.
        with gr.Column():
            output_html = gr.HTML(label="Translated Resume Output")
            download_md = gr.File(label="Download Translated Resume (Markdown)")

    # Wire the button to the processing function.
    submit_btn.click(
        fn=process_pdf,
        inputs=[pdf_input, pages_input, prompt_input, src_lang_input, tgt_lang_input],
        outputs=[output_html, download_md],
    )


if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio
2
+ dwani
sample_resume.pdf ADDED
Binary file (74.3 kB). View file