AI-Solutions-KK commited on
Commit
96c0667
·
0 Parent(s):

Initial deployment - Academic Paraphraser with complete functionality

Browse files
.gitignore ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Karan Tatyaso Kamble
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🧪 Engineering Academic Paraphraser (EAP)
2
+
3
+ > **Advanced AI-Powered Academic Writing Assistant for Engineering Domains**
4
+
5
+ [![Python 3.7+](https://img.shields.io/badge/python-3.7+-blue.svg)](https://www.python.org/downloads/)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+ [![Transformers](https://img.shields.io/badge/🤗-Transformers-orange)](https://huggingface.co/transformers/)
8
+ [![Build Status](https://img.shields.io/badge/build-passing-green.svg)]()
9
+
10
+ ## 📋 Table of Contents
11
+ - [Overview](#-overview)
12
+ - [Features](#-features)
13
+ - [Architecture](#-architecture)
14
+ - [Installation](#-installation)
15
+ - [Quick Start](#-quick-start)
16
+ - [Usage Examples](#-usage-examples)
17
+ - [API Documentation](#-api-documentation)
18
+ - [Testing](#-testing)
19
+ - [Performance](#-performance)
20
+ - [Contributing](#-contributing)
21
+ - [License](#-license)
22
+
23
+ ## 🔬 Overview
24
+
25
+ The **Engineering Academic Paraphraser** is a sophisticated AI-powered tool designed specifically for academic and technical writing in engineering domains. It combines state-of-the-art natural language processing with domain-specific knowledge to provide intelligent paraphrasing while preserving technical accuracy and meaning.
26
+
27
+ ### 🎯 Key Objectives
28
+ - **Preserve Technical Accuracy**: Maintains engineering terminology and concepts
29
+ - **Enhance Writing Quality**: Improves readability and academic style
30
+ - **Reduce Similarity**: Helps avoid plagiarism while retaining original meaning
31
+ - **Multi-Domain Support**: Covers Mechanical, Electrical, Computer Science, and Civil Engineering
32
+
33
+ ## ✨ Features
34
+
35
+ ### 🚀 Core Components
36
+
37
+ | Component | Description | Technology |
38
+ |-----------|-------------|------------|
39
+ | **🤖 Academic Paraphraser** | T5-based neural paraphrasing | Transformer Architecture |
40
+ | **🔍 Plagiarism Remover** | Rule-based similarity reduction | NLP + Linguistics |
41
+ | **📊 Quality Checker** | Comprehensive assessment | Multi-metric Analysis |
42
+
43
+ ### 🛠️ Advanced Capabilities
44
+
45
+ - **🎓 Domain-Specific Processing**
46
+ - Mechanical Engineering terminology preservation
47
+ - Electrical Engineering concept handling
48
+ - Computer Science algorithm descriptions
49
+ - Civil Engineering technical language
50
+
51
+ - **📝 Intelligent Text Processing**
52
+ - Synonym replacement with context awareness
53
+ - Sentence restructuring while preserving meaning
54
+ - Technical term identification and protection
55
+ - Academic style enhancement
56
+
57
+ - **📈 Quality Assessment**
58
+ - Similarity analysis (lexical & structural)
59
+ - Readability scoring
60
+ - Word variety metrics
61
+ - Length appropriateness checking
62
+
63
+ - **⚡ Performance Optimized**
64
+ - Lightweight T5-small model for testing
65
+ - Efficient rule-based processing
66
+ - Comprehensive error handling
67
+ - Scalable architecture
68
+
69
+ ## 🏗️ Architecture
70
+
71
+ ```mermaid
72
+ graph TB
73
+ A[Input Text] --> B[Domain Detection]
74
+ B --> C{Processing Pipeline}
75
+
76
+ C --> D[Academic Paraphraser]
77
+ C --> E[Plagiarism Remover]
78
+
79
+ D --> F[Technical Term Preservation]
80
+ E --> G[Rule-Based Transformation]
81
+
82
+ F --> H[Quality Assessment]
83
+ G --> H
84
+
85
+ H --> I[Similarity Analysis]
86
+ H --> J[Readability Check]
87
+ H --> K[Vocabulary Assessment]
88
+
89
+ I --> L[Final Output]
90
+ J --> L
91
+ K --> L
92
+
93
+ L --> M[Quality Score]
94
+ L --> N[Processed Text]
95
+ L --> O[Recommendations]
96
+ ```
97
+
98
+ ## 🚀 Installation
99
+
100
+ ### Prerequisites
101
+ - Python 3.7+
102
+ - PyTorch
103
+ - Transformers library
104
+ - NLTK
105
+ - SpaCy
106
+
107
+ ### Method 1: Clone Repository
108
+ ```bash
109
+ git clone https://github.com/yourusername/engineering-academic-paraphraser.git
110
+ cd engineering-academic-paraphraser
111
+ pip install -r requirements.txt
112
+ ```
113
+
114
+ ### Method 2: Google Colab Setup
115
+ ```python
116
+ # Mount Google Drive
117
+ from google.colab import drive
118
+ drive.mount('/content/drive')
119
+
120
+ # Clone repository
121
+ !git clone https://github.com/yourusername/engineering-academic-paraphraser.git
122
+ %cd engineering-academic-paraphraser
123
+
124
+ # Install dependencies
125
+ !pip install -q transformers torch nltk spacy textstat sentence-transformers
126
+ !python -m spacy download en_core_web_sm
127
+ ```
128
+
129
+ ### Required Packages
130
+ ```bash
131
+ pip install transformers>=4.0.0
132
+ pip install torch>=1.7.0
133
+ pip install nltk>=3.6
134
+ pip install spacy>=3.4.0
135
+ pip install textstat>=0.7.0
136
+ pip install sentence-transformers>=2.2.0
137
+ pip install numpy pandas scipy scikit-learn
138
+ ```
139
+
140
+ ## 🚀 Quick Start
141
+
142
+ ### Basic Usage
143
+ ```python
144
+ from models.model1_paraphraser import AcademicParaphraser
145
+ from models.model2_plagiarism_remover import PlagiarismRemover
146
+ from models.utils.quality_checker import QualityChecker
147
+
148
+ # Initialize components
149
+ paraphraser = AcademicParaphraser()
150
+ plagiarism_remover = PlagiarismRemover()
151
+ quality_checker = QualityChecker()
152
+
153
+ # Sample text
154
+ text = """The mechanical transmission system utilizes advanced gear
155
+ mechanisms to achieve optimal torque distribution."""
156
+
157
+ # Generate paraphrases
158
+ results = paraphraser.paraphrase(text, domain="mechanical", num_variants=3)
159
+
160
+ # Remove plagiarism indicators
161
+ processed = plagiarism_remover.remove_plagiarism(
162
+ text, domain="mechanical", aggressiveness="medium"
163
+ )
164
+
165
+ # Assess quality
166
+ quality = quality_checker.comprehensive_quality_check(
167
+ text, processed['processed_text'], domain="mechanical"
168
+ )
169
+
170
+ print(f"Quality Score: {quality['overall_score']:.1f}%")
171
+ ```
172
+
173
+ ## 📚 Usage Examples
174
+
175
+ ### Example 1: Mechanical Engineering
176
+ ```python
177
+ # Input
178
+ original = """The stress analysis reveals significant strain concentrations
179
+ at critical junction points, requiring enhanced material properties."""
180
+
181
+ # Process
182
+ result = plagiarism_remover.remove_plagiarism(original, "mechanical", "high")
183
+
184
+ # Output
185
+ print(result['processed_text'])
186
+ # "The stress examination demonstrates considerable strain accumulation
187
+ # at vital connection locations, necessitating improved material characteristics."
188
+ ```
189
+
190
+ ### Example 2: Computer Science
191
+ ```python
192
+ # Input
193
+ original = """The algorithm implementation utilizes efficient data structures
194
+ to optimize computational complexity."""
195
+
196
+ # Generate variants
197
+ variants = paraphraser.paraphrase(original, "computer_science", 2)
198
+
199
+ for variant in variants:
200
+ print(f"Variant {variant['variant_id']}: {variant['paraphrased_text']}")
201
+ print(f"Confidence: {variant['confidence_score']:.2f}")
202
+ ```
203
+
204
+ ### Example 3: Quality Assessment
205
+ ```python
206
+ # Comprehensive quality check
207
+ original = "The electrical circuit demonstrates high impedance characteristics."
208
+ paraphrased = "This electrical network exhibits elevated impedance properties."
209
+
210
+ quality = quality_checker.comprehensive_quality_check(original, paraphrased)
211
+
212
+ print(f"Overall Score: {quality['overall_score']:.1f}%")
213
+ print(f"Similarity: {quality['detailed_scores']['similarity']['overall_similarity']:.3f}")
214
+ print(f"Recommendations: {quality['recommendations']}")
215
+ ```
216
+
217
+ ## 📖 API Documentation
218
+
219
+ ### AcademicParaphraser Class
220
+
221
+ #### `paraphrase(text, domain="general", num_variants=3)`
222
+ Generates multiple paraphrased versions of input text.
223
+
224
+ **Parameters:**
225
+ - `text` (str): Input text to paraphrase
226
+ - `domain` (str): Engineering domain ('mechanical', 'electrical', 'computer_science', 'civil')
227
+ - `num_variants` (int): Number of variants to generate
228
+
229
+ **Returns:**
230
+ - List of dictionaries containing paraphrased variants with metadata
231
+
232
+ #### `extract_technical_terms(text, domain)`
233
+ Identifies and extracts technical terms for preservation.
234
+
235
+ ### PlagiarismRemover Class
236
+
237
+ #### `remove_plagiarism(text, domain="general", aggressiveness="medium")`
238
+ Applies transformations to reduce text similarity.
239
+
240
+ **Parameters:**
241
+ - `text` (str): Input text to process
242
+ - `domain` (str): Engineering domain
243
+ - `aggressiveness` (str): Processing intensity ('low', 'medium', 'high')
244
+
245
+ **Returns:**
246
+ - Dictionary with processed text and transformation metadata
247
+
248
+ ### QualityChecker Class
249
+
250
+ #### `comprehensive_quality_check(original_text, paraphrased_text, domain="general")`
251
+ Performs detailed quality assessment.
252
+
253
+ **Returns:**
254
+ - Comprehensive quality metrics and recommendations
255
+
256
+ ## 🧪 Testing
257
+
258
+ ### Run Comprehensive Tests
259
+ ```python
260
+ # Import test runner
261
+ from tests.comprehensive_test import TestRunner
262
+
263
+ # Initialize and run tests
264
+ test_runner = TestRunner()
265
+ results = test_runner.run_all_tests()
266
+
267
+ # View results
268
+ print(f"Overall Success Rate: {sum(r.get('success_rate', 0) for r in results.values()) / len(results):.1f}%")
269
+ ```
270
+
271
+ ### Test Categories
272
+ - ✅ **Import Tests**: Verify all components load correctly
273
+ - ✅ **Initialization Tests**: Check model loading and setup
274
+ - ✅ **Functionality Tests**: Validate core processing capabilities
275
+ - ✅ **Pipeline Tests**: Test end-to-end processing
276
+ - ✅ **Error Handling**: Verify graceful error management
277
+ - ✅ **Performance Tests**: Check processing speed and efficiency
278
+
279
+ ### Sample Test Results
280
+ ```
281
+ 🧪 COMPREHENSIVE TEST RESULTS
282
+ ════════════════════════════════════════
283
+ ✅ IMPORTS: 3/3 passed (100.0%)
284
+ ✅ INITIALIZATION: 3/3 passed (100.0%)
285
+ ✅ BASIC_FUNCTIONALITY: 3/3 passed (100.0%)
286
+ ✅ PIPELINE: 4/4 passed (100.0%)
287
+ ✅ ERROR_HANDLING: 4/4 passed (100.0%)
288
+ ✅ PERFORMANCE: 1/1 passed (100.0%)
289
+
290
+ 🎯 OVERALL RESULT: 18/18 tests passed (100.0%)
291
+ 🎉 EXCELLENT! Ready for deployment
292
+ ```
293
+
294
+ ## ⚡ Performance
295
+
296
+ ### Benchmarks
297
+ | Component | Processing Time | Memory Usage | Accuracy |
298
+ |-----------|----------------|--------------|----------|
299
+ | Plagiarism Remover | ~0.1s per 100 words | < 50MB | 85-90% |
300
+ | Quality Checker | ~0.05s per assessment | < 30MB | 90-95% |
301
+ | T5 Paraphraser | ~2-5s per variant | 200-500MB | 80-90% |
302
+
303
+ ### Optimization Features
304
+ - 🚀 **Lightweight Models**: T5-small for faster processing
305
+ - ⚡ **Efficient Algorithms**: Optimized rule-based transformations
306
+ - 💾 **Memory Management**: Minimal resource usage
307
+ - 🔄 **Batch Processing**: Support for multiple texts
308
+
309
+ ## 🗂️ Project Structure
310
+
311
+ ```
312
+ engineering-academic-paraphraser/
313
+
314
+ ├── models/
315
+ │ ├── __init__.py
316
+ │ ├── model1_paraphraser.py # T5-based paraphrasing
317
+ │ ├── model2_plagiarism_remover.py # Rule-based processing
318
+ │ └── utils/
319
+ │ ├── __init__.py
320
+ │ └── quality_checker.py # Quality assessment
321
+
322
+ ├── tests/
323
+ │ ├── __init__.py
324
+ │ └── comprehensive_test.py # Complete test suite
325
+
326
+ ├── examples/
327
+ │ ├── basic_usage.py
328
+ │ ├── domain_specific_examples.py
329
+ │ └── batch_processing.py
330
+
331
+ ├── docs/
332
+ │ ├── api_reference.md
333
+ │ ├── user_guide.md
334
+ │ └── technical_details.md
335
+
336
+ ├── requirements.txt
337
+ ├── setup.py
338
+ ├── README.md
339
+ └── LICENSE
340
+ ```
341
+
342
+ ## 🤝 Contributing
343
+
344
+ We welcome contributions! Please follow these steps:
345
+
346
+ 1. **Fork the Repository**
347
+ 2. **Create Feature Branch**
348
+ ```bash
349
+ git checkout -b feature/amazing-feature
350
+ ```
351
+ 3. **Commit Changes**
352
+ ```bash
353
+ git commit -m 'Add amazing feature'
354
+ ```
355
+ 4. **Push to Branch**
356
+ ```bash
357
+ git push origin feature/amazing-feature
358
+ ```
359
+ 5. **Open Pull Request**
360
+
361
+ ### Development Guidelines
362
+ - Follow PEP 8 style guidelines
363
+ - Add comprehensive tests for new features
364
+ - Update documentation as needed
365
+ - Maintain backward compatibility
366
+
367
+ ## 🐛 Known Issues & Limitations
368
+
369
+ - **T5 Model**: May require significant memory (>2GB RAM)
370
+ - **Processing Speed**: T5 inference can be slow on CPU
371
+ - **Domain Coverage**: Currently optimized for 4 engineering domains
372
+ - **Language Support**: English only at present
373
+
374
+ ## 🛠️ Troubleshooting
375
+
376
+ ### Common Issues
377
+
378
+ #### Import Errors
379
+ ```python
380
+ # If you encounter import errors, try:
381
+ import sys
382
+ sys.path.append('/path/to/project')
383
+ ```
384
+
385
+ #### Memory Issues with T5
386
+ ```python
387
+ # Use smaller model variant:
388
+ paraphraser = AcademicParaphraser(model_name="t5-small")
389
+ ```
390
+
391
+ #### NLTK Data Missing
392
+ ```python
393
+ import nltk
394
+ nltk.download('punkt')
395
+ nltk.download('stopwords')
396
+ ```
397
+
398
+ ## 📞 Support
399
+
400
+ - **Documentation**: [Full API Reference](docs/api_reference.md)
401
+ - **Examples**: See `examples/` directory
402
+ - **Issues**: [GitHub Issues](https://github.com/yourusername/engineering-academic-paraphraser/issues)
403
+ - **Discussions**: [GitHub Discussions](https://github.com/yourusername/engineering-academic-paraphraser/discussions)
404
+
405
+ ## 📜 License
406
+
407
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
408
+
409
+ ## 🏆 Acknowledgments
410
+
411
+ - **Hugging Face Transformers** for the T5 model implementation
412
+ - **NLTK & SpaCy** for natural language processing tools
413
+ - **PyTorch** for deep learning framework
414
+ - **Engineering Community** for domain-specific insights
415
+
416
+ ## 📊 Citation
417
+
418
+ If you use this work in your research, please cite:
419
+
420
+ ```bibtex
421
+ @software{engineering_academic_paraphraser,
422
+ title={Engineering Academic Paraphraser: AI-Powered Writing Assistant for Technical Domains},
423
+ author={Your Name},
424
+ year={2024},
425
+ url={https://github.com/yourusername/engineering-academic-paraphraser}
426
+ }
427
+ ```
428
+
429
+ ---
430
+
431
+ <div align="center">
432
+
433
+ **🌟 Star this repository if you find it helpful! 🌟**
434
+
435
+ Made with ❤️ for the Engineering Academic Community
436
+
437
+ [![GitHub stars](https://img.shields.io/github/stars/yourusername/engineering-academic-paraphraser?style=social)](https://github.com/yourusername/engineering-academic-paraphraser/stargazers)
438
+ [![GitHub forks](https://img.shields.io/github/forks/yourusername/engineering-academic-paraphraser?style=social)](https://github.com/yourusername/engineering-academic-paraphraser/network/members)
439
+
440
+ </div>
app.py ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # FILE: app.py (HuggingFace Spaces entry point)
3
+ # =============================================
4
+
5
+ #!/usr/bin/env python3
6
+ """
7
+ HuggingFace Spaces deployment entry point for Engineering Academic Paraphraser
8
+ """
9
+
10
+ import sys
11
+ import os
12
+ from pathlib import Path
13
+
14
+ # Add current directory to Python path
15
+ current_dir = Path(__file__).parent
16
+ sys.path.append(str(current_dir))
17
+
18
+ # Import and run the main application
19
+ try:
20
+ from frontend_backend.main import main
21
+
22
+ if __name__ == "__main__":
23
+ main()
24
+
25
+ except ImportError as e:
26
+ import streamlit as st
27
+ st.error(f"❌ Import Error: {e}")
28
+ st.error("Please check the file structure and dependencies")
29
+ st.info("This app requires the complete project structure to function properly")
30
+
31
+ # FILE: README.md
32
+ # ===============
33
+
34
+ # 🔬 Engineering Academic Paraphraser
35
+
36
+ Professional AI-powered paraphrasing and plagiarism removal tools specifically designed for engineering research, academic papers, and technical documentation.
37
+
38
+ ## 🎯 Features
39
+
40
+ ### 📝 Academic Paraphraser
41
+ - **Intelligent Paraphrasing**: Advanced T5-based model for high-quality text rewriting
42
+ - **Technical Term Preservation**: Maintains engineering terminology and domain-specific vocabulary
43
+ - **Citation Protection**: Preserves academic references and citations
44
+ - **Quality Metrics**: Real-time assessment of paraphrase quality and semantic similarity
45
+
46
+ ### 🛡️ Plagiarism Remover
47
+ - **Advanced Originality**: Deep text transformation for maximum uniqueness
48
+ - **Risk Assessment**: Real-time plagiarism risk analysis
49
+ - **Multiple Techniques**: Combines rule-based and neural approaches
50
+ - **Academic Integrity**: Maintains technical accuracy while ensuring originality
51
+
52
+ ## 🚀 Quick Start
53
+
54
+ ### Local Installation
55
+
56
+ 1. **Clone the repository:**
57
+ ```bash
58
+ git clone https://github.com/yourusername/engineering-paraphraser.git
59
+ cd engineering-paraphraser
60
+ ```
61
+
62
+ 2. **Install dependencies:**
63
+ ```bash
64
+ pip install -r requirements.txt
65
+ ```
66
+
67
+ 3. **Download required models:**
68
+ ```bash
69
+ python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords')"
70
+ ```
71
+
72
+ 4. **Run the application:**
73
+ ```bash
74
+ streamlit run frontend_backend/main.py
75
+ ```
76
+
77
+ ### Cloud Deployment (HuggingFace Spaces)
78
+
79
+ 1. **Create a new Space on HuggingFace**
80
+ 2. **Upload all project files maintaining the directory structure**
81
+ 3. **Set Space SDK to "Streamlit"**
82
+ 4. **The app will automatically deploy**
83
+
84
+ ## 📁 Project Structure
85
+
86
+ ```
87
+ engineering-paraphraser/
88
+ ├── models/
89
+ │ ├── __init__.py
90
+ │ ├── model1_paraphraser.py # Academic Paraphraser
91
+ │ ├── model2_plagiarism_remover.py # Plagiarism Remover
92
+ │ └── utils/
93
+ │ ├── __init__.py
94
+ │ ├── text_processor.py # Text preprocessing utilities
95
+ │ ├── quality_checker.py # Quality assessment tools
96
+ │ └── engineering_terms.py # Engineering domain terms
97
+ ├── frontend_backend/
98
+ │ └── main.py # Streamlit GUI application
99
+ ├── config/
100
+ │ ├── requirements.txt # Python dependencies
101
+ │ └── model_config.py # Configuration settings
102
+ ├── docs/
103
+ │ ├── README.md # This file
104
+ │ ├── documentation.md # Detailed documentation
105
+ │ └── usage_examples.ipynb # Jupyter notebook examples
106
+ ├── tests/
107
+ │ └── test_models.py # Unit tests
108
+ ├── app.py # HuggingFace Spaces entry point
109
+ └── packages.txt # System dependencies
110
+ ```
111
+
112
+ ## 🎛️ Usage Guide
113
+
114
+ ### For Academic Paraphrasing:
115
+ 1. Input your research text
116
+ 2. Select "Academic Paraphraser"
117
+ 3. Adjust creativity level (0.1-1.0)
118
+ 4. Enable technical term preservation
119
+ 5. Generate multiple variants
120
+ 6. Review quality metrics
121
+
122
+ ### For Plagiarism Removal:
123
+ 1. Input text requiring originality
124
+ 2. Select "Plagiarism Remover"
125
+ 3. Set modification intensity
126
+ 4. Preserve citations and technical terms
127
+ 5. Generate unique variants
128
+ 6. Check uniqueness scores
129
+
130
+ ## 🔧 Configuration
131
+
132
+ ### Model Settings
133
+ - **Paraphraser Model**: T5-Small (77M parameters)
134
+ - **Plagiarism Model**: DistilBERT + Custom algorithms
135
+ - **Sentence Model**: all-MiniLM-L6-v2
136
+ - **Max Length**: 512 tokens
137
+ - **Similarity Threshold**: 0.7
138
+
139
+ ### Engineering Domains Supported
140
+ - Mechanical Engineering
141
+ - Electrical Engineering
142
+ - Computer Science
143
+ - Civil Engineering
144
+ - Chemical Engineering
145
+ - Biomedical Engineering
146
+
147
+ ## 🧪 Testing
148
+
149
+ Run the test suite:
150
+ ```bash
151
+ python -m pytest tests/
152
+ ```
153
+
154
+ Test individual models:
155
+ ```bash
156
+ python models/model1_paraphraser.py
157
+ python models/model2_plagiarism_remover.py
158
+ ```
159
+
160
+ ## 📊 Performance Metrics
161
+
162
+ ### Quality Indicators:
163
+ - **Semantic Similarity**: 0.7-0.9 (optimal range)
164
+ - **Lexical Diversity**: >0.3 (good variation)
165
+ - **Length Preservation**: 0.8-1.2 (appropriate length)
166
+ - **Uniqueness Score**: >0.8 (low plagiarism risk)
167
+
168
+ ## 🤝 Contributing
169
+
170
+ 1. Fork the repository
171
+ 2. Create a feature branch
172
+ 3. Make your changes
173
+ 4. Add tests for new functionality
174
+ 5. Submit a pull request
175
+
176
+ ## 📄 License
177
+
178
+ MIT License - see LICENSE file for details
179
+
180
+ ## 🔗 Links
181
+
182
+ - **Live Demo**: [HuggingFace Spaces](https://huggingface.co/spaces/yourusername/engineering-paraphraser)
183
+ - **Documentation**: [Full Documentation](docs/documentation.md)
184
+ - **Issues**: [GitHub Issues](https://github.com/yourusername/engineering-paraphraser/issues)
185
+
186
+ ## 🆘 Support
187
+
188
+ For support and questions:
189
+ - Open an issue on GitHub
190
+ - Check the documentation
191
+ - Review the example notebooks
192
+
193
+ ## 🏷️ Version
194
+
195
+ Current Version: **1.0.0**
196
+
197
+ ---
198
+
199
+ **⚠️ Important Notice**: This tool is designed to assist academic writing and research. Always review generated content for accuracy and appropriateness. Users are responsible for ensuring compliance with their institution's academic integrity policies.
200
+
201
+ # FILE: documentation.md
202
+ # =====================
203
+
204
+ # 📚 Engineering Academic Paraphraser - Technical Documentation
205
+
206
+ ## 🏗️ Architecture Overview
207
+
208
+ The Engineering Academic Paraphraser is built on a modular architecture that separates concerns and enables scalable, maintainable code.
209
+
210
+ ### Core Components
211
+
212
+ #### 1. Model Layer (`models/`)
213
+ - **model1_paraphraser.py**: T5-based academic paraphrasing engine
214
+ - **model2_plagiarism_remover.py**: Advanced plagiarism detection and removal
215
+ - **utils/**: Shared utilities for text processing and quality assessment
216
+
217
+ #### 2. Frontend Layer (`frontend_backend/`)
218
+ - **main.py**: Streamlit-based user interface
219
+ - Interactive controls and real-time feedback
220
+ - Quality metrics visualization
221
+
222
+ #### 3. Configuration Layer (`config/`)
223
+ - **model_config.py**: Centralized configuration management
224
+ - Model parameters and domain-specific settings
225
+ - Processing thresholds and quality metrics
226
+
227
+ ## 🔬 Technical Details
228
+
229
+ ### Model 1: Academic Paraphraser
230
+
231
+ **Technology Stack:**
232
+ - **Base Model**: T5-Small (Text-to-Text Transfer Transformer)
233
+ - **Framework**: HuggingFace Transformers
234
+ - **Preprocessing**: NLTK + spaCy
235
+ - **Quality Assessment**: Sentence Transformers
236
+
237
+ **Key Features:**
238
+ - Semantic similarity preservation (0.7-0.9 range)
239
+ - Technical terminology protection
240
+ - Citation and reference preservation
241
+ - Multi-variant generation
242
+ - Real-time quality scoring
243
+
244
+ **Processing Pipeline:**
245
+ 1. **Input Preprocessing**: Clean and tokenize text
246
+ 2. **Term Protection**: Identify and preserve technical terms
247
+ 3. **Citation Extraction**: Preserve academic references
248
+ 4. **T5 Processing**: Generate paraphrased variants
249
+ 5. **Quality Filtering**: Assess semantic similarity and fluency
250
+ 6. **Post-processing**: Restore protected elements
251
+
252
+ ### Model 2: Plagiarism Remover
253
+
254
+ **Technology Stack:**
255
+ - **Primary Models**: DistilBERT + T5-Small
256
+ - **Analysis Tools**: TF-IDF Vectorization + Cosine Similarity
257
+ - **Enhancement**: Rule-based transformation algorithms
258
+ - **Validation**: Multi-metric originality assessment
259
+
260
+ **Key Features:**
261
+ - Plagiarism risk assessment (0.0-1.0 scale)
262
+ - Advanced sentence restructuring
263
+ - Voice conversion (active ↔ passive)
264
+ - Contextual synonym replacement
265
+ - Phrase uniqueness optimization
266
+
267
+ **Transformation Techniques:**
268
+ 1. **Semantic Restructuring**: Deep sentence reorganization
269
+ 2. **Lexical Substitution**: Context-aware synonym replacement
270
+ 3. **Syntactic Transformation**: Grammar pattern modification
271
+ 4. **Discourse Reordering**: Clause and phrase rearrangement
272
+
273
+ ## 🎯 Quality Assurance
274
+
275
+ ### Metrics and Thresholds
276
+
277
+ #### Paraphraser Quality Metrics:
278
+ - **Semantic Similarity**: 0.6-0.95 (too low = meaning loss, too high = insufficient change)
279
+ - **Lexical Diversity**: >0.15 (proportion of changed words)
280
+ - **Length Preservation**: 0.7-1.5 (relative length ratio)
281
+ - **Academic Quality**: Boolean check for academic language patterns
282
+
283
+ #### Plagiarism Removal Metrics:
284
+ - **Uniqueness Score**: >0.8 (1.0 - plagiarism_risk)
285
+ - **Phrase Originality**: >0.7 (proportion of unique phrases)
286
+ - **Semantic Preservation**: >0.6 (maintain original meaning)
287
+ - **Technical Accuracy**: Preserved domain terminology
288
+
289
+ ### Quality Control Pipeline
290
+
291
+ ```python
292
+ def quality_assessment_pipeline(original, processed):
293
+ """Multi-dimensional quality assessment"""
294
+
295
+ # Semantic similarity check
296
+ similarity = calculate_similarity(original, processed)
297
+
298
+ # Lexical diversity analysis
299
+ diversity = analyze_lexical_changes(original, processed)
300
+
301
+ # Academic pattern preservation
302
+ academic_quality = check_academic_patterns(processed)
303
+
304
+ # Technical term integrity
305
+ term_preservation = verify_technical_terms(original, processed)
306
+
307
+ return QualityScore(similarity, diversity, academic_quality, term_preservation)
308
+ ```
309
+
310
+ ## 🔧 Configuration Management
311
+
312
+ ### Model Configuration
313
+
314
+ ```python
315
+ class ModelConfig:
316
+ # Core model settings
317
+ PARAPHRASER_MODEL = "t5-small" # 77M parameters
318
+ PLAGIARISM_MODEL = "distilbert-base" # 66M parameters
319
+ SENTENCE_MODEL = "all-MiniLM-L6-v2" # 22M parameters
320
+
321
+ # Processing parameters
322
+ MAX_LENGTH = 512 # Token limit
323
+ MIN_SIMILARITY_THRESHOLD = 0.7 # Quality threshold
324
+ BATCH_SIZE = 8 # Processing batch size
325
+
326
+ # Domain-specific settings
327
+ PROTECTED_TERMS = [...] # Engineering terminology
328
+ CITATION_PATTERNS = [...] # Academic reference patterns
329
+ ```
330
+
331
+ ### Engineering Domain Specialization
332
+
333
+ The system includes specialized handling for engineering domains:
334
+
335
+ #### Protected Technical Terms:
336
+ - **General Engineering**: algorithm, methodology, optimization, simulation
337
+ - **Mechanical**: thermodynamics, kinematics, stress analysis
338
+ - **Electrical**: impedance, frequency response, circuit analysis
339
+ - **Computer Science**: data structures, algorithms, complexity analysis
340
+ - **Civil**: structural analysis, load calculations, material properties
341
+
342
+ #### Academic Pattern Recognition:
343
+ - Citation formats: `[1]`, `(Author, 2023)`, `et al.`
344
+ - Figure references: `Figure 1`, `Table 2`, `Equation 3`
345
+ - Technical units: `Hz`, `V`, `MPa`, `kg/m³`
346
+ - Standards: `IEEE`, `ASME`, `ISO`, `ASTM`
347
+
348
+ ## 🚀 Performance Optimization
349
+
350
+ ### Computational Efficiency
351
+
352
+ #### Model Loading Strategy:
353
+ ```python
354
+ @st.cache_resource
355
+ def load_model(model_name):
356
+ """Cached model loading for Streamlit deployment"""
357
+ return pipeline("text2text-generation", model_name, device=-1)
358
+ ```
359
+
360
+ #### Memory Management:
361
+ - **Lazy Loading**: Models loaded only when needed
362
+ - **Batch Processing**: Process multiple sentences efficiently
363
+ - **Caching**: Streamlit resource caching for model persistence
364
+ - **CPU Optimization**: Quantized models for resource-constrained environments
365
+
366
+ #### Processing Speed:
367
+ - **T5-Small**: ~2-3 seconds per paragraph (CPU)
368
+ - **DistilBERT**: ~1-2 seconds per analysis (CPU)
369
+ - **Memory Usage**: ~2-4GB RAM total
370
+ - **Concurrent Users**: 10-20 simultaneous users supported
371
+
372
+ ## 🔒 Security and Privacy
373
+
374
+ ### Data Handling:
375
+ - **No Persistent Storage**: All processing in memory
376
+ - **Session Isolation**: Each user session independent
377
+ - **No External Calls**: Models run locally/on deployment server
378
+ - **Privacy-First**: No text data sent to external APIs
379
+
380
+ ### Academic Integrity:
381
+ - **Transparency**: Clear indication of AI assistance
382
+ - **Quality Metrics**: Visible similarity and uniqueness scores
383
+ - **User Responsibility**: Clear guidelines for appropriate use
384
+ - **Institutional Compliance**: Designed to support academic policies
385
+
386
+ ## 🧪 Testing and Validation
387
+
388
+ ### Test Coverage:
389
+
390
+ #### Unit Tests:
391
+ ```python
392
+ # Test paraphraser functionality
393
+ def test_paraphraser_quality():
394
+ paraphraser = EngineeringParaphraser()
395
+ result = paraphraser.paraphrase_academic_text(test_text)
396
+ assert 0.7 <= calculate_similarity(test_text, result[0]) <= 0.9
397
+
398
+ # Test plagiarism removal
399
+ def test_plagiarism_removal():
400
+ remover = EngineeringPlagiarismRemover()
401
+ result = remover.remove_plagiarism_advanced(test_text)
402
+ uniqueness = remover.get_uniqueness_score(result[0])
403
+ assert uniqueness['uniqueness_score'] >= 0.8
404
+ ```
405
+
406
+ #### Integration Tests:
407
+ - End-to-end processing workflows
408
+ - GUI component functionality
409
+ - File upload/download operations
410
+ - Multi-user session handling
411
+
412
+ #### Performance Tests:
413
+ - Processing speed benchmarks
414
+ - Memory usage profiling
415
+ - Concurrent user simulation
416
+ - Model loading time optimization
417
+
418
+ ## 📈 Monitoring and Analytics
419
+
420
+ ### Quality Metrics Tracking:
421
+ - Real-time quality score calculation
422
+ - Historical performance analysis
423
+ - User interaction patterns
424
+ - Model effectiveness measurement
425
+
426
+ ### Error Handling:
427
+ - Graceful degradation for model failures
428
+ - Fallback processing options
429
+ - Comprehensive error logging
430
+ - User-friendly error messages
431
+
432
+ ## 🔄 Future Development
433
+
434
+ ### Planned Enhancements:
435
+ 1. **Domain-Specific Models**: Fine-tuned models for specific engineering fields
436
+ 2. **Advanced Quality Metrics**: More sophisticated similarity measures
437
+ 3. **Batch Processing**: Multiple document processing
438
+ 4. **API Development**: RESTful API for integration
439
+ 5. **Mobile Optimization**: Responsive design improvements
440
+
441
+ ### Research Directions:
442
+ - **Neural Architecture Search**: Optimized model architectures
443
+ - **Few-Shot Learning**: Rapid domain adaptation
444
+ - **Explainable AI**: Interpretable paraphrasing decisions
445
+ - **Multimodal Processing**: Image and equation handling
446
+
447
+ ---# Create these directories in your GitHub repo:
448
+ models/
449
+ ├── __init__.py
450
+ ├── model1_paraphraser.py
451
+ ├── model2_plagiarism_remover.py
452
+ └── utils/
453
+ ├── __init__.py
454
+ ├── text_processor.py
455
+ ├── quality_checker.py
456
+ └── engineering_terms.py
457
+
458
+ frontend_backend/
459
+ └── main.py
460
+
461
+ config/
462
+ ├── requirements.txt
463
+ └── model_config.py
frontend_backend/main.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FILE: frontend_backend/main.py
2
+ # ===============================
3
+
4
+ import streamlit as st
5
+ import sys
6
+ import os
7
+ from pathlib import Path
8
+ import logging
9
+ import time
10
+ from typing import List, Dict
11
+ import plotly.express as px
12
+ import pandas as pd
13
+
14
+ # Add project root to path for imports
15
+ project_root = Path(__file__).parent.parent
16
+ sys.path.append(str(project_root))
17
+
18
+ try:
19
+ from models.model1_paraphraser import EngineeringParaphraser
20
+ from models.model2_plagiarism_remover import EngineeringPlagiarismRemover
21
+ from models.utils.text_processor import AcademicTextProcessor
22
+ from config.model_config import ModelConfig
23
+ except ImportError as e:
24
+ st.error(f"❌ Import Error: {e}")
25
+ st.error("Please ensure all model files are in the correct directory structure")
26
+ st.stop()
27
+
28
+ # Configure logging
29
+ logging.basicConfig(level=logging.INFO)
30
+ logger = logging.getLogger(__name__)
31
+
32
+ # Page configuration
33
+ st.set_page_config(
34
+ page_title="Engineering Academic Paraphraser",
35
+ page_icon="🔬",
36
+ layout="wide",
37
+ initial_sidebar_state="expanded"
38
+ )
39
+
40
+ # Custom CSS for professional styling
41
+ st.markdown("""
42
+ <style>
43
+ .main-header {
44
+ background: linear-gradient(90deg, #1e3c72, #2a5298);
45
+ padding: 1rem;
46
+ border-radius: 10px;
47
+ color: white;
48
+ text-align: center;
49
+ margin-bottom: 2rem;
50
+ }
51
+ .tool-card {
52
+ border: 2px solid #e0e0e0;
53
+ border-radius: 10px;
54
+ padding: 1rem;
55
+ margin: 1rem 0;
56
+ background: #f8f9fa;
57
+ }
58
+ .quality-metric {
59
+ background: #e8f5e8;
60
+ padding: 0.5rem;
61
+ border-radius: 5px;
62
+ margin: 0.2rem 0;
63
+ }
64
+ .warning-box {
65
+ background: #fff3cd;
66
+ border: 1px solid #ffeaa7;
67
+ padding: 1rem;
68
+ border-radius: 5px;
69
+ margin: 1rem 0;
70
+ }
71
+ .success-box {
72
+ background: #d4edda;
73
+ border: 1px solid #c3e6cb;
74
+ padding: 1rem;
75
+ border-radius: 5px;
76
+ margin: 1rem 0;
77
+ }
78
+ </style>
79
+ """, unsafe_allow_html=True)
80
+
81
+ # Initialize session state
82
+ def initialize_session_state():
83
+ """Initialize all session state variables"""
84
+ if "paraphraser" not in st.session_state:
85
+ st.session_state.paraphraser = None
86
+ if "plagiarism_remover" not in st.session_state:
87
+ st.session_state.plagiarism_remover = None
88
+ if "current_text" not in st.session_state:
89
+ st.session_state.current_text = ""
90
+ if "processed_variants" not in st.session_state:
91
+ st.session_state.processed_variants = []
92
+ if "current_variant_index" not in st.session_state:
93
+ st.session_state.current_variant_index = 0
94
+ if "processing_history" not in st.session_state:
95
+ st.session_state.processing_history = []
96
+ if "quality_metrics" not in st.session_state:
97
+ st.session_state.quality_metrics = {}
98
+
99
+ @st.cache_resource
100
+ def load_models():
101
+ """Load models with caching"""
102
+ try:
103
+ st.info("🔄 Loading AI models... This may take a moment on first run.")
104
+
105
+ # Initialize models
106
+ paraphraser = EngineeringParaphraser()
107
+ plagiarism_remover = EngineeringPlagiarismRemover()
108
+
109
+ # Load models
110
+ paraphraser_loaded = paraphraser.load_model()
111
+ plagiarism_loaded = plagiarism_remover.load_models()
112
+
113
+ if paraphraser_loaded and plagiarism_loaded:
114
+ st.success("✅ All models loaded successfully!")
115
+ return paraphraser, plagiarism_remover
116
+ else:
117
+ st.error("❌ Failed to load some models")
118
+ return None, None
119
+
120
+ except Exception as e:
121
+ st.error(f"❌ Error loading models: {str(e)}")
122
+ return None, None
123
+
124
+ def create_file_handlers():
125
+ """Create file upload and download handlers"""
126
+ st.sidebar.markdown("### 📁 File Operations")
127
+
128
+ # File upload
129
+ uploaded_file = st.sidebar.file_uploader(
130
+ "Upload Document",
131
+ type=['txt', 'docx', 'pdf'],
132
+ help="Upload academic papers, thesis, or research documents"
133
+ )
134
+
135
+ if uploaded_file is not None:
136
+ try:
137
+ if uploaded_file.type == "text/plain":
138
+ content = str(uploaded_file.read(), "utf-8")
139
+ else:
140
+ st.sidebar.warning("For DOCX/PDF files, please copy-paste the text content for now.")
141
+ content = ""
142
+
143
+ if content:
144
+ st.session_state.current_text = content
145
+ st.sidebar.success(f"✅ Loaded {len(content.split())} words")
146
+
147
+ except Exception as e:
148
+ st.sidebar.error(f"❌ Error reading file: {str(e)}")
149
+
150
+ # Download options
151
+ if st.session_state.processed_variants:
152
+ st.sidebar.markdown("### 💾 Download Results")
153
+
154
+ for i, variant in enumerate(st.session_state.processed_variants):
155
+ if st.sidebar.download_button(
156
+ f"📄 Download Variant {i+1}",
157
+ variant,
158
+ file_name=f"processed_variant_{i+1}.txt",
159
+ mime="text/plain"
160
+ ):
161
+ st.sidebar.success(f"Downloaded Variant {i+1}")
162
+
163
+ def create_main_interface():
164
+ """Create the main user interface"""
165
+
166
+ # Header
167
+ st.markdown("""
168
+ <div class="main-header">
169
+ <h1>🔬 Engineering Academic Paraphraser</h1>
170
+ <p>Professional AI-powered paraphrasing and plagiarism removal for engineering research</p>
171
+ </div>
172
+ """, unsafe_allow_html=True)
173
+
174
+ # Main content area
175
+ col1, col2 = st.columns([2, 1])
176
+
177
+ with col1:
178
+ st.markdown("### 📝 Input Text")
179
+
180
+ # Text input
181
+ input_text = st.text_area(
182
+ "Paste your academic text here:",
183
+ value=st.session_state.current_text,
184
+ height=200,
185
+ placeholder="Enter engineering research text, thesis content, or academic papers..."
186
+ )
187
+
188
+ if input_text != st.session_state.current_text:
189
+ st.session_state.current_text = input_text
190
+
191
+ # Word count and basic analysis
192
+ if input_text:
193
+ word_count = len(input_text.split())
194
+ char_count = len(input_text)
195
+ sentences = len([s for s in input_text.split('.') if s.strip()])
196
+
197
+ col_stat1, col_stat2, col_stat3 = st.columns(3)
198
+ col_stat1.metric("Words", word_count)
199
+ col_stat2.metric("Characters", char_count)
200
+ col_stat3.metric("Sentences", sentences)
201
+
202
+ with col2:
203
+ st.markdown("### ⚙️ Processing Options")
204
+
205
+ # Tool selection
206
+ selected_tool = st.selectbox(
207
+ "Choose Processing Tool:",
208
+ ["Academic Paraphraser", "Plagiarism Remover"],
209
+ help="Paraphraser: Improves readability while preserving meaning\nPlagiarism Remover: Maximizes originality and uniqueness"
210
+ )
211
+
212
+ # Advanced settings
213
+ with st.expander("🔧 Advanced Settings"):
214
+ if selected_tool == "Academic Paraphraser":
215
+ creativity = st.slider("Creativity Level", 0.1, 1.0, 0.7, 0.1)
216
+ preserve_terms = st.checkbox("Preserve Technical Terms", value=True)
217
+ preserve_citations = st.checkbox("Preserve Citations", value=True)
218
+ max_variants = st.slider("Number of Variants", 1, 5, 3)
219
+ else:
220
+ aggressiveness = st.slider("Modification Intensity", 0.1, 1.0, 0.8, 0.1)
221
+ preserve_terms = st.checkbox("Preserve Technical Terms", value=True)
222
+ preserve_citations = st.checkbox("Preserve Citations", value=True)
223
+ max_variants = st.slider("Number of Variants", 1, 5, 3)
224
+
225
+ # Process button
226
+ process_button = st.button(
227
+ f"🚀 Run {selected_tool}",
228
+ type="primary",
229
+ disabled=not input_text.strip()
230
+ )
231
+
232
+ def process_text(tool_type: str, **kwargs):
233
+ """Process text with selected tool"""
234
+
235
+ if not st.session_state.current_text.strip():
236
+ st.warning("⚠️ Please enter some text to process")
237
+ return
238
+
239
+ # Load models if not already loaded
240
+ if st.session_state.paraphraser is None or st.session_state.plagiarism_remover is None:
241
+ paraphraser, plagiarism_remover = load_models()
242
+ if paraphraser and plagiarism_remover:
243
+ st.session_state.paraphraser = paraphraser
244
+ st.session_state.plagiarism_remover = plagiarism_remover
245
+ else:
246
+ st.error("❌ Failed to load models. Please refresh the page.")
247
+ return
248
+
249
+ # Process text
250
+ try:
251
+ with st.spinner(f"🔄 Processing with {tool_type}..."):
252
+ start_time = time.time()
253
+
254
+ if tool_type == "Academic Paraphraser":
255
+ variants = st.session_state.paraphraser.paraphrase_academic_text(
256
+ text=st.session_state.current_text,
257
+ preserve_citations=kwargs.get('preserve_citations', True),
258
+ preserve_technical_terms=kwargs.get('preserve_terms', True),
259
+ creativity_level=kwargs.get('creativity', 0.7),
260
+ max_variants=kwargs.get('max_variants', 3)
261
+ )
262
+
263
+ # Calculate quality metrics
264
+ quality_metrics = []
265
+ for variant in variants:
266
+ metrics = st.session_state.paraphraser.get_paraphrase_quality_score(
267
+ st.session_state.current_text, variant
268
+ )
269
+ quality_metrics.append(metrics)
270
+
271
+ else: # Plagiarism Remover
272
+ variants = st.session_state.plagiarism_remover.remove_plagiarism_advanced(
273
+ text=st.session_state.current_text,
274
+ aggressiveness=kwargs.get('aggressiveness', 0.8),
275
+ preserve_technical_terms=kwargs.get('preserve_terms', True),
276
+ preserve_citations=kwargs.get('preserve_citations', True),
277
+ max_variants=kwargs.get('max_variants', 3)
278
+ )
279
+
280
+ # Calculate uniqueness metrics
281
+ quality_metrics = []
282
+ for variant in variants:
283
+ metrics = st.session_state.plagiarism_remover.get_uniqueness_score(variant)
284
+ quality_metrics.append(metrics)
285
+
286
+ processing_time = time.time() - start_time
287
+
288
+ # Store results
289
+ st.session_state.processed_variants = variants
290
+ st.session_state.quality_metrics = quality_metrics
291
+ st.session_state.current_variant_index = 0
292
+
293
+ # Add to history
294
+ st.session_state.processing_history.append({
295
+ 'tool': tool_type,
296
+ 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
297
+ 'processing_time': round(processing_time, 2),
298
+ 'variants_count': len(variants),
299
+ 'original_length': len(st.session_state.current_text.split()),
300
+ })
301
+
302
+ st.success(f"✅ Processing completed in {processing_time:.2f} seconds!")
303
+ st.success(f"Generated {len(variants)} high-quality variants")
304
+
305
+ except Exception as e:
306
+ st.error(f"❌ Processing failed: {str(e)}")
307
+ logger.error(f"Processing error: {str(e)}")
308
+
309
+ def display_results():
310
+ """Display processing results with quality metrics"""
311
+
312
+ if not st.session_state.processed_variants:
313
+ return
314
+
315
+ st.markdown("---")
316
+ st.markdown("### 📊 Results & Quality Analysis")
317
+
318
+ # Variant navigation
319
+ col1, col2, col3 = st.columns([1, 2, 1])
320
+
321
+ with col1:
322
+ if st.button("◀ Previous", disabled=st.session_state.current_variant_index == 0):
323
+ st.session_state.current_variant_index -= 1
324
+ st.rerun()
325
+
326
+ with col2:
327
+ variant_selector = st.selectbox(
328
+ "Select Variant:",
329
+ range(len(st.session_state.processed_variants)),
330
+ index=st.session_state.current_variant_index,
331
+ format_func=lambda x: f"Variant {x+1}"
332
+ )
333
+ if variant_selector != st.session_state.current_variant_index:
334
+ st.session_state.current_variant_index = variant_selector
335
+ st.rerun()
336
+
337
+ with col3:
338
+ if st.button("Next ▶", disabled=st.session_state.current_variant_index >= len(st.session_state.processed_variants) - 1):
339
+ st.session_state.current_variant_index += 1
340
+ st.rerun()
341
+
342
+ # Current variant display
343
+ current_variant = st.session_state.processed_variants[st.session_state.current_variant_index]
344
+ current_metrics = st.session_state.quality_metrics[st.session_state.current_variant_index] if st.session_state.quality_metrics else {}
345
+
346
+ # Side-by-side comparison
347
+ col_orig, col_proc = st.columns(2)
348
+
349
+ with col_orig:
350
+ st.markdown("#### 📄 Original Text")
351
+ st.text_area("", value=st.session_state.current_text, height=200, disabled=True, key="orig_display")
352
+
353
+ with col_proc:
354
+ st.markdown(f"#### ✨ Variant {st.session_state.current_variant_index + 1}")
355
+ st.text_area("", value=current_variant, height=200, key=f"variant_display_{st.session_state.current_variant_index}")
356
+
357
+ # Quality metrics visualization
358
+ if current_metrics:
359
+ st.markdown("#### 📈 Quality Metrics")
360
+
361
+ # Create metrics dataframe for visualization
362
+ if 'semantic_similarity' in current_metrics:
363
+ # Paraphraser metrics
364
+ metrics_data = {
365
+ 'Metric': ['Semantic Similarity', 'Lexical Diversity', 'Length Preservation', 'Overall Quality'],
366
+ 'Score': [
367
+ current_metrics.get('semantic_similarity', 0),
368
+ current_metrics.get('lexical_diversity', 0),
369
+ current_metrics.get('length_preservation', 0),
370
+ current_metrics.get('overall_quality', 0)
371
+ ]
372
+ }
373
+ else:
374
+ # Plagiarism remover metrics
375
+ metrics_data = {
376
+ 'Metric': ['Uniqueness Score', 'Phrase Originality', 'Overall Safety'],
377
+ 'Score': [
378
+ current_metrics.get('uniqueness_score', 0),
379
+ current_metrics.get('phrase_originality', 0),
380
+ 1.0 - current_metrics.get('plagiarism_risk', 0)
381
+ ]
382
+ }
383
+
384
+ df_metrics = pd.DataFrame(metrics_data)
385
+
386
+ # Create bar chart
387
+
models/__init__.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Engineering Academic Paraphraser - Models Package
3
+ =================================================
4
+
5
+ This package contains the core AI models for academic text paraphrasing
6
+ and plagiarism removal, specifically designed for engineering domains.
7
+
8
+ Models:
9
+ - model1_paraphraser.py: T5-based academic paraphraser
10
+ - model2_plagiarism_remover.py: DistilBERT-based plagiarism remover
11
+
12
+ Utils:
13
+ - text_processor.py: Text preprocessing and postprocessing
14
+ - quality_checker.py: Quality assessment and metrics
15
+ - engineering_terms.py: Domain-specific terminology protection
16
+
17
+ Version: 1.0.0
18
+ Author: Engineering Academic Tools
19
+ License: MIT
20
+ """
21
+
22
+ from .model1_paraphraser import AcademicParaphraser
23
+ from .model2_plagiarism_remover import PlagiarismRemover
24
+
25
+ # Import utility classes
26
+ from .utils.text_processor import TextProcessor
27
+ from .utils.quality_checker import QualityChecker
28
+ from .utils.engineering_terms import EngineeringTerms
29
+
30
+ # Package metadata
31
+ __version__ = "1.0.0"
32
+ __author__ = "Engineering Academic Tools"
33
+ __email__ = "support@engacademictools.com"
34
+ __description__ = "Professional AI models for engineering academic text processing"
35
+
36
+ # Available models and utilities
37
+ __all__ = [
38
+ 'AcademicParaphraser',
39
+ 'PlagiarismRemover',
40
+ 'TextProcessor',
41
+ 'QualityChecker',
42
+ 'EngineeringTerms',
43
+ ]
44
+
45
+ # Model configurations
46
+ MODEL_CONFIGS = {
47
+ 'paraphraser': {
48
+ 'name': 'Academic Paraphraser',
49
+ 'base_model': 't5-base',
50
+ 'max_length': 512,
51
+ 'domains': ['mechanical', 'electrical', 'computer_science', 'civil']
52
+ },
53
+ 'plagiarism_remover': {
54
+ 'name': 'Plagiarism Remover',
55
+ 'base_model': 'distilbert-base-uncased',
56
+ 'similarity_threshold': 0.7,
57
+ 'min_changes_required': 3
58
+ }
59
+ }
60
+
61
+ # Supported engineering domains
62
+ ENGINEERING_DOMAINS = [
63
+ 'mechanical_engineering',
64
+ 'electrical_engineering',
65
+ 'computer_science',
66
+ 'civil_engineering',
67
+ 'chemical_engineering',
68
+ 'aerospace_engineering'
69
+ ]
70
+
71
+ def get_model_info():
72
+ """Get information about available models"""
73
+ return {
74
+ 'models': list(MODEL_CONFIGS.keys()),
75
+ 'domains': ENGINEERING_DOMAINS,
76
+ 'version': __version__
77
+ }
78
+
79
+ def initialize_models():
80
+ """Initialize all models with default configurations"""
81
+ paraphraser = AcademicParaphraser()
82
+ plagiarism_remover = PlagiarismRemover()
83
+
84
+ return {
85
+ 'paraphraser': paraphraser,
86
+ 'plagiarism_remover': plagiarism_remover
87
+ }
models/config/model_config.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # FILE 2: model_config.py
3
+ # =======================
4
+ MODEL_CONFIG_PY = """
5
+ import os
6
+ from pathlib import Path
7
+
8
+ class ModelConfig:
9
+ # Model Settings
10
+ PARAPHRASER_MODEL = "t5-small"
11
+ PLAGIARISM_MODEL = "distilbert-base-uncased"
12
+ SENTENCE_MODEL = "all-MiniLM-L6-v2"
13
+
14
+ # Processing Settings
15
+ MAX_LENGTH = 512
16
+ MIN_SIMILARITY_THRESHOLD = 0.7
17
+ BATCH_SIZE = 8
18
+
19
+ # Engineering Domain Terms
20
+ PROTECTED_TERMS = [
21
+ "algorithm", "methodology", "framework", "architecture",
22
+ "coefficient", "parameter", "variable", "function",
23
+ "equation", "formula", "theorem", "hypothesis",
24
+ "IEEE", "ASME", "ASCE", "ISO", "ANSI"
25
+ ]
26
+
27
+ # Academic Patterns to Preserve
28
+ CITATION_PATTERNS = [
29
+ r'\[\d+\]', # [1], [23]
30
+ r'\([^)]*\d{4}[^)]*\)', # (Author, 2023)
31
+ r'et al\.', # et al.
32
+ r'Figure \d+', # Figure 1
33
+ r'Table \d+', # Table 1
34
+ r'Equation \d+', # Equation 1
35
+ ]
36
+
37
+ # File Paths
38
+ BASE_DIR = Path(__file__).parent.parent
39
+ MODELS_DIR = BASE_DIR / "models"
40
+ CACHE_DIR = BASE_DIR / "cache"
41
+
42
+ @classmethod
43
+ def ensure_directories(cls):
44
+ cls.CACHE_DIR.mkdir(exist_ok=True)
45
+ cls.MODELS_DIR.mkdir(exist_ok=True)
46
+ """
models/config/requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FILE 1: requirements.txt
2
+ # ========================
3
+ REQUIREMENTS_TXT = """
4
+ streamlit>=1.28.0
5
+ transformers>=4.35.0
6
+ torch>=2.0.0
7
+ sentence-transformers>=2.2.2
8
+ nltk>=3.8
9
+ spacy>=3.7.0
10
+ scikit-learn>=1.3.0
11
+ numpy>=1.24.0
12
+ pandas>=2.0.0
13
+ python-docx>=0.8.11
14
+ PyMuPDF>=1.23.0
15
+ language-tool-python>=2.7.1
16
+ textblob>=0.17.1
17
+ huggingface-hub>=0.17.0
18
+ accelerate>=0.24.0
19
+ ""
models/model1_paraphraser.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FILE: models/model1_paraphraser.py
2
+ # ===================================
3
+
4
+ import torch
5
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
6
+ from sentence_transformers import SentenceTransformer
7
+ import re
8
+ import logging
9
+ from typing import List, Dict, Tuple, Optional
10
+ import streamlit as st
11
+
12
+ from .utils.text_processor import AcademicTextProcessor
13
+ from .utils.engineering_terms import EngineeringTermsProtector
14
+ from config.model_config import ModelConfig
15
+
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+ class EngineeringParaphraser:
20
+ """
21
+ Professional academic paraphraser for engineering texts.
22
+ Focuses on maintaining technical accuracy while improving readability.
23
+ """
24
+
25
+ def __init__(self, model_name: str = "t5-small"):
26
+ self.model_name = model_name
27
+ self.tokenizer = None
28
+ self.model = None
29
+ self.paraphrase_pipeline = None
30
+ self.text_processor = AcademicTextProcessor()
31
+ self.terms_protector = EngineeringTermsProtector()
32
+ self.quality_threshold = ModelConfig.MIN_SIMILARITY_THRESHOLD
33
+
34
+ @st.cache_resource
35
+ def load_model(_self):
36
+ """Load T5 model with caching for Streamlit"""
37
+ try:
38
+ logger.info(f"Loading paraphraser model: {_self.model_name}")
39
+
40
+ _self.tokenizer = AutoTokenizer.from_pretrained(_self.model_name)
41
+ _self.model = AutoModelForSeq2SeqLM.from_pretrained(_self.model_name)
42
+
43
+ # Create pipeline for easier inference
44
+ _self.paraphrase_pipeline = pipeline(
45
+ "text2text-generation",
46
+ model=_self.model,
47
+ tokenizer=_self.tokenizer,
48
+ device=-1, # CPU (change to 0 for GPU)
49
+ max_length=ModelConfig.MAX_LENGTH
50
+ )
51
+
52
+ logger.info("✅ Paraphraser model loaded successfully")
53
+ return True
54
+
55
+ except Exception as e:
56
+ logger.error(f"❌ Error loading model: {str(e)}")
57
+ return False
58
+
59
+ def _paraphrase_sentence(self, sentence: str, creativity: float = 0.7) -> List[str]:
60
+ """Paraphrase a single sentence with multiple variants"""
61
+ if not self.paraphrase_pipeline:
62
+ self.load_model()
63
+
64
+ try:
65
+ # Prepare input for T5
66
+ input_text = f"paraphrase: {sentence}"
67
+
68
+ # Generate multiple variants
69
+ results = self.paraphrase_pipeline(
70
+ input_text,
71
+ max_length=len(sentence.split()) * 2 + 20,
72
+ num_return_sequences=3,
73
+ do_sample=True,
74
+ temperature=creativity,
75
+ top_p=0.9,
76
+ repetition_penalty=1.2
77
+ )
78
+
79
+ variants = []
80
+ for result in results:
81
+ paraphrased = result['generated_text'].strip()
82
+
83
+ # Clean up T5 artifacts
84
+ paraphrased = self._clean_t5_output(paraphrased)
85
+
86
+ # Quality check
87
+ if self._is_good_paraphrase(sentence, paraphrased):
88
+ variants.append(paraphrased)
89
+
90
+ return variants[:2] if variants else [sentence] # Max 2 variants
91
+
92
+ except Exception as e:
93
+ logger.warning(f"Paraphrase failed for sentence, returning original: {str(e)}")
94
+ return [sentence]
95
+
96
+ def _clean_t5_output(self, text: str) -> str:
97
+ """Clean T5 model output artifacts"""
98
+ # Remove common T5 artifacts
99
+ text = re.sub(r'^paraphrase:\s*', '', text, flags=re.IGNORECASE)
100
+ text = re.sub(r'<.*?>', '', text) # Remove special tokens
101
+ text = text.strip()
102
+
103
+ # Capitalize first letter
104
+ if text and text[0].islower():
105
+ text = text[0].upper() + text[1:]
106
+
107
+ return text
108
+
109
+ def _is_good_paraphrase(self, original: str, paraphrased: str) -> bool:
110
+ """Check if paraphrase meets quality standards"""
111
+ # Basic checks
112
+ if not paraphrased or len(paraphrased.split()) < 3:
113
+ return False
114
+
115
+ # Check similarity (should be similar but not identical)
116
+ similarity = self.text_processor.calculate_similarity(original, paraphrased)
117
+
118
+ if similarity < 0.6: # Too different
119
+ return False
120
+ if similarity > 0.95: # Too similar
121
+ return False
122
+
123
+ # Check for academic quality
124
+ if not self.text_processor.is_academic_quality(paraphrased):
125
+ return False
126
+
127
+ return True
128
+
129
+ def paraphrase_academic_text(
130
+ self,
131
+ text: str,
132
+ preserve_citations: bool = True,
133
+ preserve_technical_terms: bool = True,
134
+ creativity_level: float = 0.7,
135
+ max_variants: int = 3
136
+ ) -> List[str]:
137
+ """
138
+ Main paraphrasing function for academic engineering texts.
139
+
140
+ Args:
141
+ text: Input academic text
142
+ preserve_citations: Whether to preserve citations and references
143
+ preserve_technical_terms: Whether to preserve technical terminology
144
+ creativity_level: How creative the paraphrasing should be (0.1-1.0)
145
+ max_variants: Maximum number of variants to generate
146
+
147
+ Returns:
148
+ List of paraphrased variants
149
+ """
150
+ try:
151
+ logger.info("🔄 Starting academic text paraphrasing...")
152
+
153
+ # Step 1: Clean input text
154
+ cleaned_text = self.text_processor.clean_text(text)
155
+
156
+ # Step 2: Preserve citations if requested
157
+ citation_map = {}
158
+ if preserve_citations:
159
+ cleaned_text, citation_map = self.text_processor.preserve_citations(cleaned_text)
160
+
161
+ # Step 3: Preserve technical terms if requested
162
+ term_map = {}
163
+ if preserve_technical_terms:
164
+ cleaned_text, term_map = self.terms_protector.protect_terms_in_text(cleaned_text)
165
+
166
+ # Step 4: Split into sentences for better processing
167
+ sentences = self.text_processor.split_into_sentences(cleaned_text)
168
+
169
+ # Step 5: Paraphrase each sentence
170
+ all_variants = []
171
+
172
+ for variant_num in range(max_variants):
173
+ paraphrased_sentences = []
174
+
175
+ for sentence in sentences:
176
+ if len(sentence.split()) < 4: # Skip very short sentences
177
+ paraphrased_sentences.append(sentence)
178
+ continue
179
+
180
+ variants = self._paraphrase_sentence(sentence, creativity_level)
181
+
182
+ # Choose variant based on variant_num
183
+ if variant_num < len(variants):
184
+ paraphrased_sentences.append(variants[variant_num])
185
+ else:
186
+ paraphrased_sentences.append(variants[0] if variants else sentence)
187
+
188
+ # Step 6: Combine sentences
189
+ combined_text = " ".join(paraphrased_sentences)
190
+
191
+ # Step 7: Restore protected elements
192
+ if preserve_technical_terms:
193
+ combined_text = self.terms_protector.restore_terms_in_text(combined_text, term_map)
194
+
195
+ if preserve_citations:
196
+ combined_text = self.text_processor.restore_citations(combined_text, citation_map)
197
+
198
+ # Step 8: Final cleaning
199
+ final_text = self.text_processor.clean_text(combined_text)
200
+
201
+ if final_text not in all_variants:
202
+ all_variants.append(final_text)
203
+
204
+ logger.info(f"✅ Generated {len(all_variants)} paraphrase variants")
205
+ return all_variants if all_variants else [text]
206
+
207
+ except Exception as e:
208
+ logger.error(f"❌ Paraphrasing failed: {str(e)}")
209
+ return [text] # Return original if everything fails
210
+
211
+ def get_paraphrase_quality_score(self, original: str, paraphrased: str) -> Dict[str, float]:
212
+ """Calculate quality metrics for a paraphrase"""
213
+ try:
214
+ similarity = self.text_processor.calculate_similarity(original, paraphrased)
215
+
216
+ # Lexical diversity (unique words / total words)
217
+ orig_words = set(original.lower().split())
218
+ para_words = set(paraphrased.lower().split())
219
+ lexical_change = len(para_words - orig_words) / max(len(orig_words), 1)
220
+
221
+ # Length similarity
222
+ length_ratio = len(paraphrased.split()) / max(len(original.split()), 1)
223
+ length_score = 1.0 - abs(1.0 - length_ratio)
224
+
225
+ return {
226
+ "semantic_similarity": round(similarity, 3),
227
+ "lexical_diversity": round(lexical_change, 3),
228
+ "length_preservation": round(length_score, 3),
229
+ "overall_quality": round((similarity + lexical_change + length_score) / 3, 3)
230
+ }
231
+
232
+ except Exception as e:
233
+ logger.warning(f"Quality scoring failed: {str(e)}")
234
+ return {
235
+ "semantic_similarity": 0.0,
236
+ "lexical_diversity": 0.0,
237
+ "length_preservation": 0.0,
238
+ "overall_quality": 0.0
239
+ }
240
+
241
+ # Usage example and testing
242
+ if __name__ == "__main__":
243
+ # Test the paraphraser
244
+ paraphraser = EngineeringParaphraser()
245
+
246
+ test_text = """
247
+ The algorithm demonstrates significant performance improvements in computational
248
+ efficiency when compared to traditional methods. The proposed framework utilizes
249
+ advanced optimization techniques to minimize processing time while maintaining
250
+ accuracy levels above 95%.
251
+ """
252
+
253
+ print("🧪 Testing Engineering Paraphraser...")
254
+ print(f"Original: {test_text}")
255
+ print("\n" + "="*50 + "\n")
256
+
257
+ variants = paraphraser.paraphrase_academic_text(
258
+ text=test_text,
259
+ max_variants=3,
260
+ creativity_level=0.7
261
+ )
262
+
263
+ for i, variant in enumerate(variants, 1):
264
+ print(f"Variant {i}: {variant}")
265
+
266
+ quality = paraphraser.get_paraphrase_quality_score(test_text, variant)
267
+ print(f"Quality Score: {quality}")
268
+ print("\n" + "-"*30 + "\n")
models/model2_plagiarism_remover ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FILE: models/model2_plagiarism_remover.py
2
+ # =========================================
3
+
4
+ import torch
5
+ from transformers import AutoTokenizer, AutoModel, pipeline
6
+ import numpy as np
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+ import re
10
+ import logging
11
+ from typing import List, Dict, Tuple, Set
12
+ import random
13
+ import streamlit as st
14
+
15
+ from .utils.text_processor import AcademicTextProcessor
16
+ from .utils.engineering_terms import EngineeringTermsProtector
17
+ from config.model_config import ModelConfig
18
+
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
+ class EngineeringPlagiarismRemover:
23
+ """
24
+ Advanced plagiarism removal tool specifically designed for engineering academic texts.
25
+ Focuses on creating highly original content while preserving technical accuracy.
26
+ """
27
+
28
+ def __init__(self):
29
+ self.sentence_model = None
30
+ self.paraphrase_model = None
31
+ self.text_processor = AcademicTextProcessor()
32
+ self.terms_protector = EngineeringTermsProtector()
33
+ self.tfidf_vectorizer = TfidfVectorizer(
34
+ ngram_range=(1, 3),
35
+ max_features=5000,
36
+ stop_words='english'
37
+ )
38
+
39
+ # Plagiarism detection thresholds
40
+ self.similarity_threshold = 0.3 # Below this = unique
41
+ self.phrase_overlap_threshold = 0.2
42
+
43
+ @st.cache_resource
44
+ def load_models(_self):
45
+ """Load all required models with caching"""
46
+ try:
47
+ logger.info("🔄 Loading plagiarism removal models...")
48
+
49
+ # Load sentence transformer for semantic analysis
50
+ from sentence_transformers import SentenceTransformer
51
+ _self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
52
+
53
+ # Load paraphrasing model (lighter model for speed)
54
+ _self.paraphrase_model = pipeline(
55
+ "text2text-generation",
56
+ model="t5-small",
57
+ device=-1,
58
+ max_length=512
59
+ )
60
+
61
+ logger.info("✅ All models loaded successfully")
62
+ return True
63
+
64
+ except Exception as e:
65
+ logger.error(f"❌ Error loading models: {str(e)}")
66
+ return False
67
+
68
+ def detect_plagiarism_risk(self, text: str, reference_texts: List[str] = None) -> Dict[str, float]:
69
+ """
70
+ Analyze text for potential plagiarism risks.
71
+
72
+ Args:
73
+ text: Text to analyze
74
+ reference_texts: Optional list of reference texts to compare against
75
+
76
+ Returns:
77
+ Dictionary with risk scores and analysis
78
+ """
79
+ try:
80
+ if not self.sentence_model:
81
+ self.load_models()
82
+
83
+ analysis = {
84
+ "overall_risk": 0.0,
85
+ "phrase_overlap_risk": 0.0,
86
+ "semantic_similarity_risk": 0.0,
87
+ "unique_phrases_ratio": 0.0,
88
+ "recommendations": []
89
+ }
90
+
91
+ # Analyze phrase uniqueness
92
+ phrases = self._extract_phrases(text)
93
+ common_phrases = self._identify_common_phrases(phrases)
94
+ analysis["phrase_overlap_risk"] = len(common_phrases) / max(len(phrases), 1)
95
+ analysis["unique_phrases_ratio"] = 1.0 - analysis["phrase_overlap_risk"]
96
+
97
+ # If reference texts provided, check semantic similarity
98
+ if reference_texts:
99
+ similarities = []
100
+ text_embedding = self.sentence_model.encode([text])
101
+
102
+ for ref_text in reference_texts:
103
+ ref_embedding = self.sentence_model.encode([ref_text])
104
+ sim = cosine_similarity(text_embedding, ref_embedding)[0][0]
105
+ similarities.append(sim)
106
+
107
+ analysis["semantic_similarity_risk"] = max(similarities) if similarities else 0.0
108
+
109
+ # Calculate overall risk
110
+ analysis["overall_risk"] = (
111
+ analysis["phrase_overlap_risk"] * 0.6 +
112
+ analysis["semantic_similarity_risk"] * 0.4
113
+ )
114
+
115
+ # Generate recommendations
116
+ if analysis["overall_risk"] > 0.7:
117
+ analysis["recommendations"].append("HIGH RISK: Major rewriting needed")
118
+ elif analysis["overall_risk"] > 0.4:
119
+ analysis["recommendations"].append("MEDIUM RISK: Significant paraphrasing recommended")
120
+ else:
121
+ analysis["recommendations"].append("LOW RISK: Minor adjustments sufficient")
122
+
123
+ return analysis
124
+
125
+ except Exception as e:
126
+ logger.error(f"Plagiarism detection failed: {str(e)}")
127
+ return {"overall_risk": 0.0, "error": str(e)}
128
+
129
+ def _extract_phrases(self, text: str, min_length: int = 4) -> List[str]:
130
+ """Extract meaningful phrases from text"""
131
+ sentences = self.text_processor.split_into_sentences(text)
132
+ phrases = []
133
+
134
+ for sentence in sentences:
135
+ words = sentence.split()
136
+ # Extract n-grams of different lengths
137
+ for n in range(min_length, min(len(words) + 1, 8)):
138
+ for i in range(len(words) - n + 1):
139
+ phrase = " ".join(words[i:i+n])
140
+ if self._is_meaningful_phrase(phrase):
141
+ phrases.append(phrase.lower())
142
+
143
+ return phrases
144
+
145
+ def _is_meaningful_phrase(self, phrase: str) -> bool:
146
+ """Check if phrase is meaningful (not just common words)"""
147
+ stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had'}
148
+ words = phrase.lower().split()
149
+
150
+ # Skip if too many stop words
151
+ stop_word_ratio = sum(1 for word in words if word in stop_words) / len(words)
152
+ if stop_word_ratio > 0.7:
153
+ return False
154
+
155
+ # Must contain at least one meaningful word
156
+ meaningful_words = [word for word in words if word not in stop_words and len(word) > 2]
157
+ return len(meaningful_words) >= 2
158
+
159
+ def _identify_common_phrases(self, phrases: List[str]) -> Set[str]:
160
+ """Identify commonly used phrases that increase plagiarism risk"""
161
+ common_academic_phrases = {
162
+ "in this study", "the results show", "it can be concluded",
163
+ "the purpose of this", "according to the", "as shown in figure",
164
+ "the
models/utils/__init__.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Engineering Academic Paraphraser - Utilities Package
3
+ ===================================================
4
+
5
+ Utility modules for text processing, quality assessment, and
6
+ engineering domain-specific operations.
7
+
8
+ Modules:
9
+ - text_processor.py: Text preprocessing and postprocessing utilities
10
+ - quality_checker.py: Quality metrics and assessment tools
11
+ - engineering_terms.py: Engineering terminology and domain vocabulary
12
+
13
+ Version: 1.0.0
14
+ """
15
+
16
+ from .text_processor import TextProcessor
17
+ from .quality_checker import QualityChecker
18
+ from .engineering_terms import EngineeringTerms
19
+
20
+ # Package metadata
21
+ __version__ = "1.0.0"
22
+ __all__ = ['TextProcessor', 'QualityChecker', 'EngineeringTerms']
23
+
24
+ # Utility configurations
25
+ UTILS_CONFIG = {
26
+ 'text_processor': {
27
+ 'min_sentence_length': 10,
28
+ 'max_sentence_length': 500,
29
+ 'preserve_formatting': True
30
+ },
31
+ 'quality_checker': {
32
+ 'similarity_threshold': 0.7,
33
+ 'readability_min_score': 30,
34
+ 'grammar_check_enabled': True
35
+ },
36
+ 'engineering_terms': {
37
+ 'protection_enabled': True,
38
+ 'case_sensitive': True,
39
+ 'domain_specific': True
40
+ }
41
+ }
42
+
43
+ def get_utils_info():
44
+ """Get information about available utilities"""
45
+ return {
46
+ 'utilities': __all__,
47
+ 'config': UTILS_CONFIG,
48
+ 'version': __version__
49
+ }
models/utils/engineering_terms.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # FILE 3: engineering_terms.py
3
+ # =============================
4
+ ENGINEERING_TERMS_PY = """
5
+ import re
6
+ from typing import List, Dict, Set
7
+
8
+ class EngineeringTermsProtector:
9
+ def __init__(self):
10
+ self.technical_terms = {
11
+ 'general': [
12
+ 'algorithm', 'methodology', 'framework', 'architecture',
13
+ 'optimization', 'simulation', 'modeling', 'analysis',
14
+ 'coefficient', 'parameter', 'variable', 'function',
15
+ 'equation', 'formula', 'theorem', 'hypothesis'
16
+ ],
17
+ 'mechanical': [
18
+ 'torque', 'stress', 'strain', 'fatigue', 'fracture',
19
+ 'thermodynamics', 'heat transfer', 'fluid dynamics',
20
+ 'kinematics', 'dynamics', 'statics'
21
+ ],
22
+ 'electrical': [
23
+ 'voltage', 'current', 'resistance', 'impedance',
24
+ 'capacitance', 'inductance', 'frequency', 'amplifier',
25
+ 'transistor', 'diode', 'circuit', 'microcontroller'
26
+ ],
27
+ 'computer_science': [
28
+ 'algorithm', 'data structure', 'complexity', 'recursion',
29
+ 'database', 'network', 'protocol', 'encryption',
30
+ 'API', 'framework', 'library', 'compiler'
31
+ ],
32
+ 'civil': [
33
+ 'concrete', 'steel', 'foundation', 'beam', 'column',
34
+ 'load', 'moment', 'shear', 'deflection', 'buckling'
35
+ ]
36
+ }
37
+
38
+ self.units = [
39
+ 'Hz', 'kHz', 'MHz', 'GHz', 'V', 'mV', 'kV', 'A', 'mA',
40
+ 'Ω', 'kΩ', 'MΩ', 'F', 'μF', 'nF', 'pF', 'H', 'mH', 'μH',
41
+ 'W', 'kW', 'MW', 'J', 'kJ', 'MJ', 'N', 'kN', 'Pa', 'kPa',
42
+ 'MPa', 'GPa', 'm', 'mm', 'cm', 'km', 'kg', 'g', 'mg'
43
+ ]
44
+
45
+ self.abbreviations = [
46
+ 'IEEE', 'ASME', 'ASCE', 'ISO', 'ANSI', 'ASTM', 'IEC',
47
+ 'API', 'GUI', 'CPU', 'GPU', 'RAM', 'ROM', 'USB', 'TCP',
48
+ 'IP', 'HTTP', 'HTTPS', 'FTP', 'DNS', 'SQL', 'XML', 'JSON'
49
+ ]
50
+
51
+ def get_all_terms(self) -> Set[str]:
52
+ """Get all technical terms to protect"""
53
+ all_terms = set()
54
+ for category in self.technical_terms.values():
55
+ all_terms.update(category)
56
+ all_terms.update(self.units)
57
+ all_terms.update(self.abbreviations)
58
+ return all_terms
59
+
60
+ def protect_terms_in_text(self, text: str) -> tuple[str, Dict[str, str]]:
61
+ """Replace technical terms with placeholders"""
62
+ protected_text = text
63
+ term_map = {}
64
+
65
+ all_terms = self.get_all_terms()
66
+
67
+ for i, term in enumerate(all_terms):
68
+ if term.lower() in text.lower():
69
+ placeholder = f"TECHTERM{i}"
70
+ # Case-insensitive replacement but preserve original case
71
+ pattern = re.compile(re.escape(term), re.IGNORECASE)
72
+ matches = pattern.findall(text)
73
+ if matches:
74
+ original_term = matches[0] # Get the original case
75
+ term_map[placeholder] = original_term
76
+ protected_text = pattern.sub(placeholder, protected_text)
77
+
78
+ return protected_text, term_map
79
+
80
+ def restore_terms_in_text(self, text: str, term_map: Dict[str, str]) -> str:
81
+ """Restore technical terms from placeholders"""
82
+ restored_text = text
83
+ for placeholder, original_term in term_map.items():
84
+ restored_text = restored_text.replace(placeholder, original_term)
85
+ return restored_text
86
+ """
models/utils/quality_checker.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Quality Checker for Engineering Academic Text
3
+ ============================================
4
+
5
+ Comprehensive quality assessment tool for paraphrased academic content,
6
+ specifically designed for engineering domains.
7
+
8
+ Features:
9
+ - Similarity analysis between original and paraphrased text
10
+ - Readability assessment using multiple metrics
11
+ - Grammar and syntax checking
12
+ - Academic integrity verification
13
+ - Engineering terminology preservation check
14
+ - Citation and reference validation
15
+ """
16
+
17
+ import re
18
+ import nltk
19
+ from typing import Dict, List, Tuple, Any
20
+ from textstat import flesch_reading_ease, flesch_kincaid_grade, automated_readability_index
21
+ from difflib import SequenceMatcher
22
+ import spacy
23
+ from collections import Counter
24
+ import math
25
+
26
+ try:
27
+ from sentence_transformers import SentenceTransformer
28
+ SENTENCE_TRANSFORMERS_AVAILABLE = True
29
+ except ImportError:
30
+ SENTENCE_TRANSFORMERS_AVAILABLE = False
31
+
32
+ # Download required NLTK data
33
+ try:
34
+ nltk.data.find('tokenizers/punkt')
35
+ except LookupError:
36
+ nltk.download('punkt', quiet=True)
37
+
38
+ try:
39
+ nltk.data.find('corpora/stopwords')
40
+ except LookupError:
41
+ nltk.download('stopwords', quiet=True)
42
+
43
+ class QualityChecker:
44
+ """
45
+ Comprehensive quality assessment tool for engineering academic text
46
+ """
47
+
48
+ def __init__(self):
49
+ """Initialize the quality checker"""
50
+ # Load language model for advanced analysis
51
+ try:
52
+ self.nlp = spacy.load("en_core_web_sm")
53
+ except OSError:
54
+ print("Warning: spaCy English model not found. Some features may be limited.")
55
+ self.nlp = None
56
+
57
+ # Load sentence transformer for semantic similarity
58
+ if SENTENCE_TRANSFORMERS_AVAILABLE:
59
+ try:
60
+ self.similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
61
+ except Exception:
62
+ self.similarity_model = None
63
+ else:
64
+ self.similarity_model = None
65
+
66
+ # Quality thresholds
67
+ self.thresholds = {
68
+ 'min_similarity': 0.3, # Minimum semantic similarity
69
+ 'max_similarity': 0.85, # Maximum similarity (too high = potential plagiarism)
70
+ 'min_readability': 30, # Minimum readability score
71
+ 'min_word_change_ratio': 0.3, # Minimum ratio of changed words
72
+ 'max_repetition_ratio': 0.2 # Maximum allowed repetition
73
+ }
74
+
75
+ def comprehensive_quality_check(self, original_text: str, paraphrased_text: str,
76
+ domain: str = "general") -> Dict[str, Any]:
77
+ """
78
+ Perform comprehensive quality assessment
79
+
80
+ Args:
81
+ original_text: Original academic text
82
+ paraphrased_text: Paraphrased version
83
+ domain: Engineering domain (mechanical, electrical, etc.)
84
+
85
+ Returns:
86
+ Dictionary containing all quality metrics and overall score
87
+ """
88
+ results = {
89
+ 'overall_score': 0,
90
+ 'detailed_scores': {},
91
+ 'recommendations': [],
92
+ 'pass_criteria': {},
93
+ 'metrics': {}
94
+ }
95
+
96
+ # 1. Similarity Analysis
97
+ similarity_results = self.analyze_similarity(original_text, paraphrased_text)
98
+ results['detailed_scores']['similarity'] = similarity_results
99
+
100
+ # 2. Readability Assessment
101
+ readability_results = self.assess_readability(paraphrased_text)
102
+ results['detailed_scores']['readability'] = readability_results
103
+
104
+ # 3. Grammar and Syntax Check
105
+ grammar_results = self.check_grammar_syntax(paraphrased_text)
106
+ results['detailed_scores']['grammar'] = grammar_results
107
+
108
+ # 4. Academic Integrity Check
109
+ integrity_results = self.check_academic_integrity(original_text, paraphrased_text)
110
+ results['detailed_scores']['integrity'] = integrity_results
111
+
112
+ # 5. Terminology Preservation
113
+ terminology_results = self.check_terminology_preservation(original_text, paraphrased_text, domain)
114
+ results['detailed_scores']['terminology'] = terminology_results
115
+
116
+ # 6. Calculate overall score
117
+ results['overall_score'] = self.calculate_overall_score(results['detailed_scores'])
118
+
119
+ # 7. Generate recommendations
120
+ results['recommendations'] = self.generate_recommendations(results['detailed_scores'])
121
+
122
+ # 8. Determine pass criteria
123
+ results['pass_criteria'] = self.evaluate_pass_criteria(results['detailed_scores'])
124
+
125
+ return results
126
+
127
+ def analyze_similarity(self, original: str, paraphrased: str) -> Dict[str, float]:
128
+ """Analyze similarity between original and paraphrased text"""
129
+ results = {}
130
+
131
+ # 1. Lexical similarity (word overlap)
132
+ results['lexical_similarity'] = self.calculate_lexical_similarity(original, paraphrased)
133
+
134
+ # 2. Structural similarity (sentence structure)
135
+ results['structural_similarity'] = self.calculate_structural_similarity(original, paraphrased)
136
+
137
+ # 3. Semantic similarity (meaning preservation)
138
+ results['semantic_similarity'] = self.calculate_semantic_similarity(original, paraphrased)
139
+
140
+ # 4. Overall similarity score
141
+ results['overall_similarity'] = (
142
+ results['lexical_similarity'] * 0.3 +
143
+ results['structural_similarity'] * 0.2 +
144
+ results['semantic_similarity'] * 0.5
145
+ )
146
+
147
+ return results
148
+
149
+ def assess_readability(self, text: str) -> Dict[str, float]:
150
+ """Assess readability using multiple metrics"""
151
+ results = {}
152
+
153
+ try:
154
+ # Flesch Reading Ease (higher = easier)
155
+ results['flesch_ease'] = flesch_reading_ease(text)
156
+
157
+ # Flesch-Kincaid Grade Level
158
+ results['flesch_kincaid_grade'] = flesch_kincaid_grade(text)
159
+
160
+ # Automated Readability Index
161
+ results['automated_readability'] = automated_readability_index(text)
162
+
163
+ # Calculate average readability score
164
+ readability_scores = [
165
+ max(0, min(100, results['flesch_ease'])),
166
+ max(0, min(20, 20 - results['flesch_kincaid_grade'])) * 5,
167
+ max(0, min(20, 20 - results['automated_readability'])) * 5
168
+ ]
169
+ results['average_readability'] = sum(readability_scores) / len(readability_scores)
170
+
171
+ except Exception as e:
172
+ print(f"Readability assessment error: {e}")
173
+ results = {
174
+ 'flesch_ease': 50,
175
+ 'flesch_kincaid_grade': 12,
176
+ 'automated_readability': 12,
177
+ 'average_readability': 50
178
+ }
179
+
180
+ return results
181
+
182
+ def check_grammar_syntax(self, text: str) -> Dict[str, Any]:
183
+ """Check grammar and syntax quality"""
184
+ results = {
185
+ 'grammar_score': 85, # Default score
186
+ 'syntax_score': 85,
187
+ 'issues_found': [],
188
+ 'sentence_variety': 0,
189
+ 'word_variety': 0
190
+ }
191
+
192
+ if self.nlp:
193
+ doc = self.nlp(text)
194
+
195
+ # Check sentence variety (different lengths)
196
+ sentence_lengths = [len(sent.text.split()) for sent in doc.sents]
197
+ if sentence_lengths:
198
+ length_variance = self.calculate_variance(sentence_lengths)
199
+ results['sentence_variety'] = min(100, length_variance * 10)
200
+
201
+ # Check word variety (unique words ratio)
202
+ words = [token.text.lower() for token in doc if token.is_alpha]
203
+ if words:
204
+ unique_ratio = len(set(words)) / len(words)
205
+ results['word_variety'] = unique_ratio * 100
206
+
207
+ # Basic grammar checks
208
+ grammar_issues = []
209
+ for token in doc:
210
+ # Check for common issues
211
+ if token.dep_ == "ROOT" and token.pos_ != "VERB":
212
+ grammar_issues.append("Potential sentence structure issue")
213
+
214
+ results['issues_found'] = grammar_issues[:5] # Limit to 5 issues
215
+
216
+ # Adjust grammar score based on issues
217
+ results['grammar_score'] = max(60, 90 - len(grammar_issues) * 2)
218
+
219
+ return results
220
+
221
+ def check_academic_integrity(self, original: str, paraphrased: str) -> Dict[str, Any]:
222
+ """Check academic integrity and plagiarism indicators"""
223
+ results = {
224
+ 'plagiarism_risk': 'LOW',
225
+ 'direct_copying_ratio': 0,
226
+ 'phrase_similarity': 0,
227
+ 'citation_preserved': True,
228
+ 'integrity_score': 90
229
+ }
230
+
231
+ # Check for direct copying (exact phrases)
232
+ direct_matches = self.find_direct_matches(original, paraphrased)
233
+ results['direct_copying_ratio'] = len(direct_matches) / max(1, len(original.split()))
234
+
235
+ # Check phrase-level similarity
236
+ results['phrase_similarity'] = self.calculate_phrase_similarity(original, paraphrased)
237
+
238
+ # Check if citations are preserved
239
+ results['citation_preserved'] = self.check_citations_preserved(original, paraphrased)
240
+
241
+ # Determine plagiarism risk
242
+ if results['direct_copying_ratio'] > 0.3 or results['phrase_similarity'] > 0.8:
243
+ results['plagiarism_risk'] = 'HIGH'
244
+ results['integrity_score'] = 40
245
+ elif results['direct_copying_ratio'] > 0.15 or results['phrase_similarity'] > 0.6:
246
+ results['plagiarism_risk'] = 'MEDIUM'
247
+ results['integrity_score'] = 70
248
+ else:
249
+ results['plagiarism_risk'] = 'LOW'
250
+ results['integrity_score'] = 90
251
+
252
+ return results
253
+
254
+ def check_terminology_preservation(self, original: str, paraphrased: str, domain: str) -> Dict[str, Any]:
255
+ """Check if engineering terminology is properly preserved"""
256
+ results = {
257
+ 'terminology_score': 95,
258
+ 'technical_terms_preserved': [],
259
+ 'technical_terms_lost': [],
260
+ 'domain_accuracy': 90
261
+ }
262
+
263
+ # Define engineering terms by domain
264
+ engineering_terms = {
265
+ 'mechanical': ['torque', 'stress', 'strain', 'friction', 'thermodynamics', 'kinematics'],
266
+ 'electrical': ['voltage', 'current', 'resistance', 'capacitance', 'impedance', 'frequency'],
267
+ 'computer_science': ['algorithm', 'data structure', 'complexity', 'optimization', 'recursion'],
268
+ 'civil': ['concrete', 'steel', 'load', 'beam', 'foundation', 'structural']
269
+ }
270
+
271
+ domain_terms = engineering_terms.get(domain, [])
272
+
273
+ # Extract technical terms from both texts
274
+ original_terms = self.extract_technical_terms(original, domain_terms)
275
+ paraphrased_terms = self.extract_technical_terms(paraphrased, domain_terms)
276
+
277
+ # Check preservation
278
+ preserved = set(original_terms) & set(paraphrased_terms)
279
+ lost = set(original_terms) - set(paraphrased_terms)
280
+
281
+ results['technical_terms_preserved'] = list(preserved)
282
+ results['technical_terms_lost'] = list(lost)
283
+
284
+ # Calculate terminology score
285
+ if original_terms:
286
+ preservation_ratio = len(preserved) / len(set(original_terms))
287
+ results['terminology_score'] = preservation_ratio * 100
288
+
289
+ return results
290
+
291
+ def calculate_overall_score(self, detailed_scores: Dict) -> float:
292
+ """Calculate weighted overall quality score"""
293
+ weights = {
294
+ 'similarity': 0.25,
295
+ 'readability': 0.20,
296
+ 'grammar': 0.20,
297
+ 'integrity': 0.25,
298
+ 'terminology': 0.10
299
+ }
300
+
301
+ total_score = 0
302
+ for category, weight in weights.items():
303
+ if category in detailed_scores:
304
+ if category == 'similarity':
305
+ # For similarity, we want moderate similarity (not too high, not too low)
306
+ sim_score = detailed_scores[category]['overall_similarity']
307
+ if 0.4 <= sim_score <= 0.75:
308
+ score = 90
309
+ elif sim_score < 0.4:
310
+ score = sim_score * 150 # Low similarity penalty
311
+ else:
312
+ score = max(50, 100 - (sim_score - 0.75) * 200) # High similarity penalty
313
+ elif category == 'readability':
314
+ score = detailed_scores[category]['average_readability']
315
+ elif category == 'grammar':
316
+ score = (detailed_scores[category]['grammar_score'] +
317
+ detailed_scores[category]['syntax_score']) / 2
318
+ elif category == 'integrity':
319
+ score = detailed_scores[category]['integrity_score']
320
+ elif category == 'terminology':
321
+ score = detailed_scores[category]['terminology_score']
322
+ else:
323
+ score = 75 # Default score
324
+
325
+ total_score += score * weight
326
+
327
+ return min(100, max(0, total_score))
328
+
329
+ def generate_recommendations(self, detailed_scores: Dict) -> List[str]:
330
+ """Generate actionable recommendations based on scores"""
331
+ recommendations = []
332
+
333
+ # Similarity recommendations
334
+ if 'similarity' in detailed_scores:
335
+ sim_score = detailed_scores['similarity']['overall_similarity']
336
+ if sim_score > 0.8:
337
+ recommendations.append("⚠️ High similarity detected. Consider more diverse paraphrasing.")
338
+ elif sim_score < 0.3:
339
+ recommendations.append("⚠️ Low similarity. Ensure meaning is preserved.")
340
+
341
+ # Readability recommendations
342
+ if 'readability' in detailed_scores:
343
+ read_score = detailed_scores['readability']['average_readability']
344
+ if read_score < 40:
345
+ recommendations.append("📚 Improve readability by using simpler sentence structures.")
346
+ elif read_score > 80:
347
+ recommendations.append("📈 Consider using more sophisticated vocabulary for academic tone.")
348
+
349
+ # Grammar recommendations
350
+ if 'grammar' in detailed_scores:
351
+ grammar_score = detailed_scores['grammar']['grammar_score']
352
+ if grammar_score < 80:
353
+ recommendations.append("✏️ Review grammar and sentence structure.")
354
+
355
+ # Integrity recommendations
356
+ if 'integrity' in detailed_scores:
357
+ if detailed_scores['integrity']['plagiarism_risk'] != 'LOW':
358
+ recommendations.append("🔍 High plagiarism risk. Increase paraphrasing diversity.")
359
+
360
+ # Terminology recommendations
361
+ if 'terminology' in detailed_scores:
362
+ lost_terms = detailed_scores['terminology']['technical_terms_lost']
363
+ if lost_terms:
364
+ recommendations.append(f"🔧 Preserve technical terms: {', '.join(lost_terms[:3])}")
365
+
366
+ if not recommendations:
367
+ recommendations.append("✅ Quality looks good! Minor refinements may enhance clarity.")
368
+
369
+ return recommendations
370
+
371
+ def evaluate_pass_criteria(self, detailed_scores: Dict) -> Dict[str, bool]:
372
+ """Evaluate if text meets quality criteria"""
373
+ criteria = {}
374
+
375
+ # Similarity criteria
376
+ if 'similarity' in detailed_scores:
377
+ sim = detailed_scores['similarity']['overall_similarity']
378
+ criteria['appropriate_similarity'] = 0.3 <= sim <= 0.8
379
+
380
+ # Readability criteria
381
+ if 'readability' in detailed_scores:
382
+ read = detailed_scores['readability']['average_readability']
383
+ criteria['readable'] = read >= 30
384
+
385
+ # Integrity criteria
386
+ if 'integrity' in detailed_scores:
387
+ criteria['academically_sound'] = detailed_scores['integrity']['plagiarism_risk'] == 'LOW'
388
+
389
+ # Overall pass
390
+ criteria['overall_pass'] = all(criteria.values()) if criteria else False
391
+
392
+ return criteria
393
+
394
+ # Helper methods
395
+ def calculate_lexical_similarity(self, text1: str, text2: str) -> float:
396
+ """Calculate word-level similarity"""
397
+ words1 = set(text1.lower().split())
398
+ words2 = set(text2.lower().split())
399
+ intersection = words1 & words2
400
+ union = words1 | words2
401
+ return len(intersection) / len(union) if union else 0
402
+
403
+ def calculate_structural_similarity(self, text1: str, text2: str) -> float:
404
+ """Calculate sentence structure similarity"""
405
+ return SequenceMatcher(None, text1, text2).ratio()
406
+
407
+ def calculate_semantic_similarity(self, text1: str, text2: str) -> float:
408
+ """Calculate semantic similarity using embeddings"""
409
+ if self.similarity_model:
410
+ try:
411
+ embeddings = self.similarity_model.encode([text1, text2])
412
+ similarity = self.cosine_similarity(embeddings[0], embeddings[1])
413
+ return similarity
414
+ except Exception:
415
+ pass
416
+
417
+ # Fallback to simple word overlap
418
+ return self.calculate_lexical_similarity(text1, text2)
419
+
420
+ def cosine_similarity(self, vec1, vec2):
421
+ """Calculate cosine similarity between two vectors"""
422
+ dot_product = sum(a * b for a, b in zip(vec1, vec2))
423
+ magnitude1 = math.sqrt(sum(a * a for a in vec1))
424
+ magnitude2 = math.sqrt(sum(a * a for a in vec2))
425
+ if magnitude1 == 0 or magnitude2 == 0:
426
+ return 0
427
+ return dot_product / (magnitude1 * magnitude2)
428
+
429
+ def find_direct_matches(self, text1: str, text2: str, min_length: int = 4) -> List[str]:
430
+ """Find exact phrase matches between texts"""
431
+ words1 = text1.lower().split()
432
+ words2 = text2.lower().split()
433
+ matches = []
434
+
435
+ for i in range(len(words1) - min_length + 1):
436
+ phrase = ' '.join(words1[i:i+min_length])
437
+ if phrase in ' '.join(words2):
438
+ matches.append(phrase)
439
+
440
+ return matches
441
+
442
+ def calculate_phrase_similarity(self, text1: str, text2: str) -> float:
443
+ """Calculate similarity at phrase level"""
444
+ sentences1 = nltk.sent_tokenize(text1)
445
+ sentences2 = nltk.sent_tokenize(text2)
446
+
447
+ similarities = []
448
+ for s1 in sentences1:
449
+ for s2 in sentences2:
450
+ sim = SequenceMatcher(None, s1.lower(), s2.lower()).ratio()
451
+ similarities.append(sim)
452
+
453
+ return max(similarities) if similarities else 0
454
+
455
+ def check_citations_preserved(self, original: str, paraphrased: str) -> bool:
456
+ """Check if citations are preserved"""
457
+ citation_patterns = [
458
+ r'\([^)]*\d{4}[^)]*\)', # (Author, 2023)
459
+ r'\[\d+\]', # [1]
460
+ r'\b\d{4}\b', # 2023
461
+ ]
462
+
463
+ original_citations = []
464
+ paraphrased_citations = []
465
+
466
+ for pattern in citation_patterns:
467
+ original_citations.extend(re.findall(pattern, original))
468
+ paraphrased_citations.extend(re.findall(pattern, paraphrased))
469
+
470
+ # Check if most citations are preserved
471
+ if not original_citations:
472
+ return True # No citations to preserve
473
+
474
+ preserved = len(set(original_citations) & set(paraphrased_citations))
475
+ return preserved >= len(original_citations) * 0.8 # 80% preservation rate
476
+
477
+ def extract_technical_terms(self, text: str, domain_terms: List[str]) -> List[str]:
478
+ """Extract technical terms from text"""
479
+ text_lower = text.lower()
480
+ found_terms = []
481
+
482
+ for term in domain_terms:
483
+ if term.lower() in text_lower:
484
+ found_terms.append(term)
485
+
486
+ # Also look for capitalized technical terms (likely proper nouns)
487
+ words = text.split()
488
+ for word in words:
489
+ if word[0].isupper() and len(word) > 3 and word.isalpha():
490
+ found_terms.append(word)
491
+
492
+ return found_terms
493
+
494
+ def calculate_variance(self, numbers: List[float]) -> float:
495
+ """Calculate variance of a list of numbers"""
496
+ if not numbers:
497
+ return 0
498
+ mean = sum(numbers) / len(numbers)
499
+ variance = sum((x - mean) ** 2 for x in numbers) / len(numbers)
500
+ return variance
models/utils/text_processor.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # FILE 4: text_processor.py
3
+ # ==========================
4
+ TEXT_PROCESSOR_PY = """
5
+ import re
6
+ import nltk
7
+ from typing import List, Tuple
8
+ from sentence_transformers import SentenceTransformer
9
+ import numpy as np
10
+
11
+ try:
12
+ nltk.download('punkt', quiet=True)
13
+ nltk.download('stopwords', quiet=True)
14
+ except:
15
+ pass
16
+
17
+ class AcademicTextProcessor:
18
+ def __init__(self):
19
+ self.sentence_model = None
20
+ self.citation_patterns = [
21
+ r'\[\d+\]', # [1], [23]
22
+ r'\([^)]*\d{4}[^)]*\)', # (Author, 2023)
23
+ r'et al\.', # et al.
24
+ r'Figure \s*\d+', # Figure 1
25
+ r'Table \s*\d+', # Table 1
26
+ r'Equation \s*\d+', # Equation 1
27
+ r'Section \s*\d+', # Section 1
28
+ ]
29
+
30
+ def load_sentence_model(self):
31
+ """Lazy load sentence transformer"""
32
+ if self.sentence_model is None:
33
+ self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
34
+ return self.sentence_model
35
+
36
+ def preserve_citations(self, text: str) -> Tuple[str, dict]:
37
+ """Extract and preserve citations/references"""
38
+ protected_text = text
39
+ citation_map = {}
40
+
41
+ for i, pattern in enumerate(self.citation_patterns):
42
+ matches = re.findall(pattern, text)
43
+ for j, match in enumerate(matches):
44
+ placeholder = f"CITATION{i}_{j}"
45
+ citation_map[placeholder] = match
46
+ protected_text = protected_text.replace(match, placeholder, 1)
47
+
48
+ return protected_text, citation_map
49
+
50
+ def restore_citations(self, text: str, citation_map: dict) -> str:
51
+ """Restore citations from placeholders"""
52
+ restored_text = text
53
+ for placeholder, original in citation_map.items():
54
+ restored_text = restored_text.replace(placeholder, original)
55
+ return restored_text
56
+
57
+ def split_into_sentences(self, text: str) -> List[str]:
58
+ """Split text into sentences while preserving academic structure"""
59
+ # Handle academic abbreviations that shouldn't split sentences
60
+ text = re.sub(r'et al\.', 'et al<DOT>', text)
61
+ text = re.sub(r'Fig\.', 'Fig<DOT>', text)
62
+ text = re.sub(r'Table\.', 'Table<DOT>', text)
63
+
64
+ try:
65
+ sentences = nltk.sent_tokenize(text)
66
+ except:
67
+ # Fallback if NLTK fails
68
+ sentences = re.split(r'[.!?]+\s+', text)
69
+
70
+ # Restore abbreviations
71
+ sentences = [s.replace('<DOT>', '.') for s in sentences]
72
+ return [s.strip() for s in sentences if s.strip()]
73
+
74
+ def calculate_similarity(self, text1: str, text2: str) -> float:
75
+ """Calculate semantic similarity between two texts"""
76
+ model = self.load_sentence_model()
77
+ embeddings = model.encode([text1, text2])
78
+ similarity = np.dot(embeddings[0], embeddings[1]) / (
79
+ np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1])
80
+ )
81
+ return float(similarity)
82
+
83
+ def is_academic_quality(self, text: str) -> bool:
84
+ """Check if text maintains academic quality"""
85
+ # Check for minimum length
86
+ if len(text.split()) < 5:
87
+ return False
88
+
89
+ # Check for academic markers
90
+ academic_markers = [
91
+ 'research', 'study', 'analysis', 'method', 'result',
92
+ 'conclusion', 'approach', 'framework', 'model',
93
+ 'data', 'experiment', 'evaluation', 'performance'
94
+ ]
95
+
96
+ text_lower = text.lower()
97
+ marker_count = sum(1 for marker in academic_markers if marker in text_lower)
98
+
99
+ return marker_count >= 1 # At least one academic marker
100
+
101
+ def clean_text(self, text: str) -> str:
102
+ """Clean text while preserving academic formatting"""
103
+ # Remove extra whitespace but preserve paragraph breaks
104
+ text = re.sub(r' +', ' ', text) # Multiple spaces to single
105
+ text = re.sub(r'\n\s*\n', '\n\n', text) # Clean paragraph breaks
106
+ text = text.strip()
107
+ return text
108
+ """
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FILE 1: requirements.txt
2
+ # ========================
3
+ REQUIREMENTS_TXT = """
4
+ streamlit>=1.28.0
5
+ transformers>=4.35.0
6
+ torch>=2.0.0
7
+ sentence-transformers>=2.2.2
8
+ nltk>=3.8
9
+ spacy>=3.7.0
10
+ scikit-learn>=1.3.0
11
+ numpy>=1.24.0
12
+ pandas>=2.0.0
13
+ python-docx>=0.8.11
14
+ PyMuPDF>=1.23.0
15
+ language-tool-python>=2.7.1
16
+ textblob>=0.17.1
17
+ huggingface-hub>=0.17.0
18
+ accelerate>=0.24.0
19
+ ""