Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from groq import Groq | |
| import requests | |
| import numpy as np | |
| from typing import Tuple, Dict | |
| import os | |
# Initialize the module-level Groq client that chat_with_groq() sends requests through.
# The API key is taken from the GROQ_API_KEY environment variable;
# os.environ.get() yields None when that variable is not set.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Prompts for Groq
# TRANSLATION_PROMPT is the template for the first plain-language rewrite.
# It is filled in with str.format(); placeholders: {document}, {jurisdiction}, {jargon}.
TRANSLATION_PROMPT = """You are an expert policy document translator specializing in converting complex policy language into clear, plain language that average citizens can easily understand.
**Your Task:**
Transform the following policy document into plain language while:
1. Preserving all critical information and legal requirements
2. Using simple, everyday words (avoid jargon unless absolutely necessary)
3. Breaking down complex sentences into shorter, clearer ones
4. Using active voice instead of passive voice
5. Explaining technical terms when they must be used
6. Organizing information logically with clear headings
7. Using bullet points or numbered lists for clarity when appropriate
**Guidelines:**
- Target reading level: 8th grade
- Sentence length: Aim for 15-20 words per sentence
- Use "you" and "your" to make it personal
- Replace legal jargon with everyday equivalents
- If a term must be kept, provide a brief explanation in parentheses
**Original Policy Document:**
{document}
**Additional Context (if provided):**
Jurisdiction: {jurisdiction}
Special terms to explain: {jargon}
**Please provide the plain language version below:**"""
# Template for the follow-up request sent when the round-trip similarity is too low.
# It is filled in with str.format(); placeholders: {similarity} (formatted to two
# decimals, used twice), {document}, {plain_language}, {jurisdiction}, {jargon}.
# The stated target (0.8) matches the acceptance threshold used by the pipeline
# and shown in the UI; it previously read 0.9, which contradicted both.
IMPROVEMENT_PROMPT = """The previous plain language translation has a similarity score of {similarity:.2f} with the back-translated version, which indicates potential information loss or changes in meaning.
**Your Task:**
Improve the plain language translation to better preserve the original meaning while maintaining clarity. Focus on:
1. Ensuring all key facts and requirements are preserved
2. Maintaining simple language but being more precise
3. Checking that numbers, dates, and specific requirements match the original
4. Clarifying any ambiguous statements
**Original Policy Document:**
{document}
**Previous Plain Language Version:**
{plain_language}
**Issues Identified:**
- Similarity score is below threshold (target: 0.8, current: {similarity:.2f})
- This suggests potential meaning drift or information loss
**Additional Context:**
Jurisdiction: {jurisdiction}
Special terms to explain: {jargon}
**Please provide an improved plain language version:**"""
def chat_with_groq(prompt: str, use_tools: bool = False) -> str:
    """Send one user prompt to Groq and return the complete streamed reply.

    Args:
        prompt: Text sent as the single ``user`` message of the request.
        use_tools: When True, the request enables the ``visit_website`` and
            ``web_search`` tools through the ``compound_custom`` option.

    Returns:
        All streamed content pieces concatenated, with leading and trailing
        whitespace stripped.

    Raises:
        RuntimeError: If creating the completion or reading the stream fails.
            The message starts with ``Groq API error:`` and the underlying
            exception is attached as ``__cause__``.
    """
    request = {
        "model": "groq/compound",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.5,
        "max_completion_tokens": 8000,
        "top_p": 1,
        "stream": True,
        "stop": None,
    }
    if use_tools:
        request["compound_custom"] = {
            "tools": {"enabled_tools": ["visit_website", "web_search"]}
        }

    pieces = []
    try:
        stream = client.chat.completions.create(**request)
        for chunk in stream:
            # A chunk that carries no choices would make choices[0] raise
            # IndexError and abort an otherwise usable reply; skip it instead.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                pieces.append(content)
    except Exception as e:
        # Chain the original error so the real cause stays in the traceback.
        raise RuntimeError(f"Groq API error: {str(e)}") from e

    return "".join(pieces).strip()
def translate_text(text: str, source: str, target: str) -> str:
    """Translate ``text`` from ``source`` to ``target`` via the LibreTranslate endpoint.

    Args:
        text: The text to translate.
        source: Source language code sent to the service (e.g. ``"en"``).
        target: Target language code sent to the service (e.g. ``"zh"``).

    Returns:
        The ``translatedText`` field of the JSON reply, or ``""`` when the
        reply has no such field.

    Raises:
        Exception: On any request, HTTP-status or decoding failure; the
            message is prefixed with ``Translation error:``.
    """
    endpoint = "https://jeff86-libretranslate.hf.space/translate"
    payload = {
        "q": text,
        "source": source,
        "target": target,
        "format": "text",
        "alternatives": 0,
        "api_key": "",
    }
    try:
        reply = requests.post(
            endpoint,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=60,
        )
        # Turn 4xx/5xx answers into exceptions before trying to read the body.
        reply.raise_for_status()
        return reply.json().get("translatedText", "")
    except Exception as err:
        raise Exception(f"Translation error: {str(err)}")
def get_embedding(text: str) -> np.ndarray:
    """Fetch the embedding vector for ``text`` from the Qwen3-Embedding-0.6B API.

    Args:
        text: The text to embed; it is sent as a one-element ``input`` list.

    Returns:
        A NumPy array built from the ``embedding`` field of the first entry in
        the reply's ``data`` list (an empty array if that field is missing).

    Raises:
        Exception: On any request, HTTP-status, decoding or lookup failure
            (including an empty ``data`` list); the message is prefixed with
            ``Embedding error:``.
    """
    try:
        reply = requests.post(
            "https://fahmiaziz-api-embedding.hf.space/api/v1/embeddings",
            json={"input": [text], "model": "qwen3-0.6b"},
            headers={"Content-Type": "application/json"},
            timeout=60,
        )
        reply.raise_for_status()
        body = reply.json()
        # Only one input was sent, so only the first result entry is relevant.
        first_entry = body.get("data", [])[0]
        vector = first_entry.get("embedding", [])
        return np.array(vector)
    except Exception as err:
        raise Exception(f"Embedding error: {str(err)}")
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)`` as a Python float, or ``0.0``
        when either vector is empty or has zero length (norm).
    """
    # An empty vector has no direction, so report "no similarity".
    if not len(vec1) or not len(vec2):
        return 0.0

    inner = np.dot(vec1, vec2)
    magnitude_a = np.linalg.norm(vec1)
    magnitude_b = np.linalg.norm(vec2)

    # Guard the division: a zero-norm vector would otherwise divide by zero.
    if magnitude_a == 0 or magnitude_b == 0:
        return 0.0

    return float(inner / (magnitude_a * magnitude_b))
def process_document(
    policy_text: str,
    jurisdiction: str = "",
    jargon_terms: str = "",
    max_iterations: int = 3,
    similarity_threshold: float = 0.8,
) -> Tuple[str, str, float, str]:
    """Rewrite a policy document in plain language and verify the result.

    The document is first rewritten by Groq. Each verification round then
    translates the plain-language text to Chinese and back to English, embeds
    both the plain-language text and the back-translation, and compares them
    with cosine similarity. If the score is below the threshold and rounds
    remain, Groq is asked for an improved version, which is checked in the
    next round.

    Args:
        policy_text: The policy document to rewrite. ``None``, empty or
            whitespace-only input is rejected with a warning status.
        jurisdiction: Optional jurisdiction passed to the prompts; shown as
            "Not specified" when empty.
        jargon_terms: Optional terms to explain, passed to the prompts; shown
            as "None specified" when empty.
        max_iterations: Maximum number of verification rounds.
        similarity_threshold: Minimum similarity that counts as success.

    Returns:
        A tuple ``(plain_language_text, status_message, similarity_score,
        details)``. ``details`` is a Markdown log of the processing steps.
        When the threshold is never met, the best-scoring version seen is
        returned (or the initial rewrite if no round produced a score).
        The function does not raise for failures of the external services;
        they are reported through the status and details strings.
    """
    if not policy_text or not policy_text.strip():
        return "", "⚠️ Please provide a policy document to translate.", 0.0, ""

    jurisdiction_label = jurisdiction if jurisdiction else "Not specified"
    jargon_label = jargon_terms if jargon_terms else "None specified"

    details = "### Processing Steps:\n\n"

    # Step 1: Initial translation
    details += "**1. Translating to plain language...**\n"
    prompt = TRANSLATION_PROMPT.format(
        document=policy_text,
        jurisdiction=jurisdiction_label,
        jargon=jargon_label,
    )
    try:
        plain_language = chat_with_groq(prompt)
        details += f"✓ Initial translation completed ({len(plain_language)} characters)\n\n"
    except Exception as e:
        return "", f"❌ Error during translation: {str(e)}", 0.0, details

    iteration = 0
    best_similarity = 0.0
    best_translation = plain_language
    # Tracks whether any round actually produced a score, so a service
    # failure is not reported to the user as a low-quality translation.
    scored = False

    while iteration < max_iterations:
        iteration += 1
        details += f"**Iteration {iteration}:**\n"

        # Step 2: Translate to Chinese
        details += "- Translating to Chinese for verification...\n"
        try:
            chinese_text = translate_text(plain_language, "en", "zh")
            details += f" ✓ Chinese translation: {len(chinese_text)} characters\n"
        except Exception as e:
            details += f" ❌ Translation to Chinese failed: {str(e)}\n"
            break

        # Step 3: Translate back to English
        details += "- Translating back to English...\n"
        try:
            back_translated = translate_text(chinese_text, "zh", "en")
            details += f" ✓ Back-translation completed: {len(back_translated)} characters\n"
        except Exception as e:
            details += f" ❌ Back-translation failed: {str(e)}\n"
            break

        # Step 4: Calculate similarity between the plain-language text and
        # its round-trip translation.
        details += "- Calculating semantic similarity...\n"
        try:
            emb1 = get_embedding(plain_language)
            emb2 = get_embedding(back_translated)
            similarity = cosine_similarity(emb1, emb2)
        except Exception as e:
            details += f" ❌ Similarity calculation failed: {str(e)}\n"
            break

        scored = True
        if similarity > best_similarity:
            best_similarity = similarity
            best_translation = plain_language
        details += f" ✓ Similarity score: **{similarity:.4f}**\n"

        # Check if similarity is acceptable
        if similarity >= similarity_threshold:
            details += (
                f"\n✅ **Quality threshold met!** "
                f"(similarity: {similarity:.4f} ≥ {similarity_threshold})\n"
            )
            status = f"✅ Translation successful! Similarity: {similarity:.4f}"
            return plain_language, status, similarity, details

        details += (
            f" ⚠️ Below threshold "
            f"(target: {similarity_threshold}, current: {similarity:.4f})\n"
        )

        if iteration >= max_iterations:
            details += (
                f"\n⚠️ **Maximum iterations reached.** "
                f"Best similarity: {best_similarity:.4f}\n"
            )
            break

        # Step 5: Request improvement; the new text is scored next round.
        details += "- Requesting improved translation from Groq...\n"
        improvement_prompt = IMPROVEMENT_PROMPT.format(
            document=policy_text,
            plain_language=plain_language,
            similarity=similarity,
            jurisdiction=jurisdiction_label,
            jargon=jargon_label,
        )
        try:
            plain_language = chat_with_groq(improvement_prompt)
            details += " ✓ Improved translation generated\n\n"
        except Exception as e:
            details += f" ❌ Improvement failed: {str(e)}\n"
            break

    # Reaching this point means the threshold was never met: every passing
    # score returns from inside the loop.
    if scored:
        status = (
            f"⚠️ Translation completed but similarity ({best_similarity:.4f}) "
            "is below threshold. Consider adding more context."
        )
    else:
        status = (
            "⚠️ Translation completed but quality verification could not be "
            "completed. See processing details."
        )
    return best_translation, status, best_similarity, details
# Gradio Interface
# Builds the page: an introduction, a two-column row (inputs on the left,
# results on the right), a footer with tips, and the button's click handler.
# NOTE(review): the nesting of the `with` blocks below was reconstructed from a
# source whose indentation was lost — confirm it against the deployed layout.
with gr.Blocks(title="Policy to Plain Language Translator") as demo:
    # Introduction and usage instructions shown at the top of the page.
    gr.Markdown("""
# 📜 Policy to Plain Language Translator
Transform complex policy documents into clear, understandable language using AI-powered translation and quality verification.
### How it works:
1. **Plain Language Translation**: Groq AI converts your policy document into simple, clear language
2. **Quality Verification**: The text is translated using old fashion machine translation to Chinese and back to English. Qwen3 embeddings verify the meaning is preserved (target: ≥0.8 similarity) after dummy machine translations verify the plain language is easy to interpret
3. **Iterative Improvement**: If quality is low, the AI refines the plain language automatically (up to 3 iterations)
### 📋 Best Practices:
- **Use English policy documents** for best results
- **Use Markdown format** ([Learn Markdown basics](https://www.markdownguide.org/basic-syntax/))
- Provide jurisdiction and jargon terms for better context
- Demo does not take long document, [notify us if you want paid demo](https://huggingface.co/spaces/npc0/policy-2-plain-language/discussions/new)
""")
    with gr.Row():
        # Left column: the document to translate plus optional context.
        with gr.Column(scale=1):
            policy_input = gr.Textbox(
                label="Policy Document (English, Markdown format recommended)",
                placeholder="Paste your policy document here...\n\nExample:\n# Data Privacy Policy\n\n## Section 1: Data Collection\nThe organization shall collect personally identifiable information (PII) pursuant to applicable regulations...",
                lines=15,
                max_lines=20
            )
            # Optional inputs, collapsed by default (open=False).
            with gr.Accordion("📍 Optional: Additional Context", open=False):
                jurisdiction_input = gr.Textbox(
                    label="Jurisdiction (e.g., 'California, USA', 'EU', 'Ontario, Canada')",
                    placeholder="Specify the legal jurisdiction if relevant...",
                    lines=1
                )
                jargon_input = gr.Textbox(
                    label="Jargon Terms to Explain (comma-separated)",
                    placeholder="e.g., 'PII, GDPR, data controller, consent decree'",
                    lines=2
                )
            translate_btn = gr.Button("🔄 Translate to Plain Language", variant="primary", size="lg")
        # Right column: one component per value returned by process_document.
        with gr.Column(scale=1):
            plain_output = gr.Textbox(
                label="Plain Language Version",
                lines=15,
                max_lines=20,
                buttons=["copy"]
            )
            status_output = gr.Textbox(
                label="Status",
                lines=2,
                interactive=False
            )
            similarity_output = gr.Number(
                label="Quality Score (0-1, target: ≥0.8)",
                precision=4,
                interactive=False
            )
            # Step-by-step processing log, collapsed by default (open=False).
            with gr.Accordion("🔍 Processing Details", open=False):
                details_output = gr.Markdown()
    # Footer: tips, Markdown resources, technical details and the disclaimer.
    gr.Markdown("""
### 💡 Tips for Better Results:
- **Markdown Format**: Structure your document with headers (`#`), bullet points (`-`), and paragraphs for better organization
- **Context Matters**: Providing jurisdiction and jargon terms helps the AI understand legal and regional context
- **Iteration**: The system automatically tries up to 3 times to improve translation quality if similarity is below 0.8
- **Review Output**: Always review the output for accuracy—AI is a powerful tool but human judgment is essential for legal documents
### 📚 Markdown Resources:
- [Markdown Guide - Basic Syntax](https://www.markdownguide.org/basic-syntax/)
- [Google's Markdown Style Guide](https://google.github.io/styleguide/docguide/style.html)
- [Markdown Best Practices for Documentation](https://www.markdowntoolbox.com/blog/markdown-best-practices-for-documentation/)
### 🔧 Technical Details:
- **Workflow**: Inspired by [Seattle Language Access Program](https://codeforamerica.org/news/government-gains-with-the-city-of-seattle/)
- **AI Model**: Groq Compound (efficient language model)
- **Translation**: LibreTranslate (en→zh→en for quality verification)
- **Embeddings**: Qwen3-Embedding-0.6B (semantic similarity calculation)
- **Quality Metric**: Cosine similarity between original and back-translated text
---
**⚠️ Important Disclaimer**: This tool is a demo using AI models for translation and may not be accurate. Always have documents reviewed by qualified professionals for your real world use case.
""")
    # Event handler
    # Only three inputs are wired in, so process_document runs with its default
    # max_iterations; the four outputs match its four-element return tuple.
    translate_btn.click(
        fn=process_document,
        inputs=[policy_input, jurisdiction_input, jargon_input],
        outputs=[plain_output, status_output, similarity_output, details_output]
    )
# Launch the app
# The guard keeps the server from starting when this module is imported
# rather than executed as a script.
if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft())