Spaces:

destinesiastudio
/

data-faker

Build error

App Files Files Community

mayzyo committed on Mar 7, 2024

Commit

185bb55

1 Parent(s): 91b0724

:tada: Initial commit

Browse files

Files changed (6) hide show

.gitignore +171 -0
README.md +66 -0
app.py +71 -0
data_faker/__init__.py +0 -0
data_faker/anonymiser_manager.py +33 -0
requirements.txt +6 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,171 @@

+data/
+vad_chunks/
+uploads/
+output/
+flagged/
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+Include/
+Scripts/
+share/
+pyvenv.cfg
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+envs/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

README.md CHANGED Viewed

@@ -11,3 +11,69 @@ license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Data Faker
+Data Faker is a tool designed to protect privacy and maintain confidentiality by anonymizing sensitive information in text data before passing it to language models like GPT-4. It provides a user-friendly interface for anonymizing and deanonymizing text using the Presidio Reversible Anonymizer.
+## Features
+- Anonymize sensitive information such as names, email addresses, phone numbers, and more
+- Deanonymize previously anonymized text to recover the original information
+- Maintain a separate anonymizer instance for each user session
+- Simple and intuitive web-based interface using Gradio
+## Installation
+1. Clone the repository:
+```
+git clone https://github.com/mayzyo/data-faker.git
+```
+2. Navigate to the project directory:
+```
+cd data-faker
+```
+3. Install the required dependencies:
+```
+pip install -r requirements.txt
+```
+## Usage
+1. Run the application:
+```
+python app.py
+```
+2. Open your web browser and navigate to the provided URL (e.g., `http://localhost:7860`).
+3. Enter the text you want to anonymize in the "Text to Convert" input box.
+4. Click the "Generate Fake" button to anonymize the text. The anonymized text will appear in the "Output Text" box.
+5. To deanonymize the text, click the "Revert Fake" button. The original text will be restored in the "Output Text" box.
+6. To reset the anonymizer instance and clear the input and output boxes, click the "Reset" button.
+## Configuration
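+The `AnonymiserManager` class in `data_faker/anonymiser_manager.py` uses the Presidio Reversible Anonymizer (`PresidioReversibleAnonymizer`) to handle the anonymization and deanonymization process. To choose which types of sensitive information are anonymized, edit the `analyzed_fields` list that is passed to `PresidioReversibleAnonymizer` in that file (for example `PERSON`, `EMAIL_ADDRESS`, `PHONE_NUMBER`).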
+## Contributing
+Contributions are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request.
+## License
+This project is licensed under the [MIT License](LICENSE).
+## Acknowledgements
+- [Presidio](https://github.com/microsoft/presidio) - Data anonymization library
+- [Gradio](https://gradio.app/) - Web-based interface library

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import os
+import gradio as gr
+from uuid import UUID
+from data_faker.anonymiser_manager import AnonymiserManager
+# device = 0 if torch.cuda.is_available() else "cpu"
+theme = gr.themes.Base(
+    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
+)
+manager = AnonymiserManager()
+def anonymise_text(id: UUID, input_text: str):
+    if input_text is None:
+        raise gr.Error("No text found! Please write something in the input box first.")
+    if id == None:
+        gr.Info('Using a new anonymiser instance')
+    else:
+        gr.Info(f'Running anonymise on existing instance ({id})')
+    anon_text, id = manager.add(id, input_text)
+    return id, anon_text
+def deanonymise_text(id: UUID, input_text: str):
+    if input_text is None:
+        raise gr.Error("No text found! Please write something in the input box first.")
+    if id == None:
+        raise gr.Error("Unable to deanonymise without anonymising something first.")
+    deanon_text = manager.revert(id, input_text)
+    return deanon_text
+def clear_instance(id: UUID):
+    if id is None:
+        raise gr.Error("No anonymising recorded yet, nothing to clear.")
+    if manager.clear(id) == True:
+        gr.Info(f'Anonymiser instance remove successfully ({id})')
+    else:
+        gr.Info(f'Anonymiser instance ({id}) cannot be found')
+    return None, '', ''
+# "Text to Text" tab: an input column with the anonymise/deanonymise buttons
+# and an output column with the reset button.
+with gr.Blocks() as text_to_text:
+    # Per-session value holding the anonymiser instance id (initially empty).
+    state = gr.State()
+    with gr.Row():
+        with gr.Column(scale=1):
+            input_text = gr.TextArea(label="Text to Convert", show_copy_button=True)
+            with gr.Row():
+                anon_btn = gr.Button("Generate Fake", variant="primary")
+                denon_btn = gr.Button("Revert Fake", variant="primary")
+        with gr.Column(scale=1):
+            output_text = gr.TextArea(label="Output Text", show_copy_button=True)
+            clear_btn = gr.Button("Reset")
+        # Anonymise: reads the state and input box, writes the state and output box.
+        anon_btn.click(anonymise_text, inputs=[state, input_text], outputs=[state, output_text], concurrency_limit=4)
+        # Deanonymise: reads the state and input box, writes only the output box.
+        denon_btn.click(deanonymise_text, inputs=[state, input_text], outputs=[output_text], concurrency_limit=4)
+        # Reset: reads the state, then overwrites the state and both text boxes.
+        clear_btn.click(clear_instance, inputs=[state], outputs=[state, input_text, output_text], concurrency_limit=4)
+# Top-level application: a heading followed by a tabbed interface whose only
+# tab is the "Text to Text" layout.
+with gr.Blocks(title="Data Faker", theme=theme) as demo:
+    gr.Markdown("""# Data Faker
+        A crucial data faking tool before passing information to a language model like GPT-4 to help protect privacy and maintain confidentiality""")
+    gr.TabbedInterface(
+        [text_to_text],
+        ["Text to Text"]
+    )
+if __name__ == "__main__":
+    demo.queue(api_open=True).launch(show_api=True)

data_faker/__init__.py ADDED Viewed

File without changes

data_faker/anonymiser_manager.py ADDED Viewed

	@@ -0,0 +1,33 @@

+from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
+from uuid import uuid5, UUID, NAMESPACE_DNS
+class AnonymiserManager:
+    mapping = {}
+    def add(self, id: UUID, text: str):
+        if id in self.mapping.keys():
+            anonymiser = self.mapping[id]
+        else:
+            anonymiser = PresidioReversibleAnonymizer(
+                analyzed_fields=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'IBAN_CODE', 'CREDIT_CARD', 'CRYPTO', 'IP_ADDRESS', 'LOCATION', 'NRP', 'MEDICAL_LICENSE', 'URL', 'US_BANK_NUMBER', 'US_DRIVER_LICENSE', 'US_ITIN', 'US_PASSPORT', 'US_SSN']
+            )
+            id = uuid5(NAMESPACE_DNS, 'destinesiastudio.com.au')
+            self.mapping[id] = anonymiser
+        anon_text = anonymiser.anonymize(text, language="en")
+        return anon_text, id
+    def revert(self, id: UUID, text: str) -> str:
+        if id in self.mapping.keys():
+            anonymiser: PresidioReversibleAnonymizer = self.mapping[id]
+            deanon_text = anonymiser.deanonymize(text)
+            return deanon_text
+    def clear(self, id: UUID) -> bool:
+        if id in self.mapping.keys():
+            del self.mapping[id]
+            return True
+        return False

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio==4.20.1
+langchain-experimental==0.0.53
+presidio-analyzer==2.2.353
+presidio-anonymizer==2.2.353
+spacy==3.7.4
+faker==24.0.0