Spaces:
Sleeping
Sleeping
import io
import pickle
import re

import pandas as pd
import xlsxwriter
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS

from modules.deed_preprocessing.preprocessor import preprocess_text
from modules.deed_preprocessing.spellcheck import correct_spelling
from modules.google_cloud_ocr.google_cloud_ocr import google_cloud_ocr
from modules.model_experimentation.bag_of_words_logistic_regression import predict
from modules.openai.racist_chatgpt_analysis import racist_chatgpt_analysis
app = Flask(__name__)

# CORS(app, resources={r"/*": {"origins": "*"}})
# Allow cross-origin requests from any origin, with credentials enabled.
CORS(app, supports_credentials=True, origins="*")

# Load the fitted bag-of-words vectorizer and logistic-regression model once at
# startup so request handlers can reuse them without re-reading the pickles.
# NOTE(review): pickle.load is only safe on trusted build artifacts — never
# point these paths at user-supplied files.
with open('modules/model_experimentation/vectorizer.pkl', 'rb') as vec_file:
    vectorizer = pickle.load(vec_file)

with open('modules/model_experimentation/logistic_model.pkl', 'rb') as model_file:
    logistic_model = pickle.load(model_file)
# Helper to look for the book and page numbers
def extract_book_and_page(text):
    """Return ``(book_numbers, page_numbers)`` found in *text*.

    Each element is a list of digit strings: every run of digits that
    directly follows the word ``book`` / ``page`` (case-insensitive,
    separated by whitespace), in order of appearance.
    """
    patterns = {
        'book': r"book\s+(\d+)",
        'page': r"page\s+(\d+)",
    }
    found = {
        label: re.findall(pattern, text, re.IGNORECASE)
        for label, pattern in patterns.items()
    }
    return found['book'], found['page']
def upload_file():
    """Handle a deed-document upload: OCR, spell-correct, preprocess, and
    classify the text for racially restrictive language.

    Reads from the multipart form:
      * ``file``            — the uploaded document (required)
      * ``ocr_engine``      — 'google' (implemented) or 'azure' (stub); default 'google'
      * ``analysis_method`` — 'chatgpt' or 'logistic_regression'; default 'chatgpt'

    Returns a JSON response: 200 with OCR text, extracted info, and the
    analysis result; 400 for missing/unsupported input; 500 on any
    processing error.

    NOTE(review): no ``@app.route`` decorator is visible on this handler, so
    as shown it is never registered with Flask — confirm whether a decorator
    (e.g. ``@app.route('/upload', methods=['POST'])``) was lost upstream.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400
    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    ocr_engine = request.form.get('ocr_engine', 'google')
    analysis_method = request.form.get('analysis_method', 'chatgpt')
    try:
        if ocr_engine == 'google':
            # Step 1: Get text using Google OCR
            google_text = google_cloud_ocr(file)
            # Step 2: Pass text through the spell checker
            spellchecked_text = correct_spelling(google_text)
            # Step 3: Pass text through the preprocessor
            # (assumes preprocess_text returns a dict providing at least
            # 'names', 'locations', and 'original_text' — TODO confirm)
            processed_text = preprocess_text(spellchecked_text)
            # Extract book and page numbers right after spellchecking
            book_numbers, page_numbers = extract_book_and_page(spellchecked_text)
            # Step 4: Get the names and locations
            extracted_info = {
                "names": processed_text.get("names", []),
                "locations": processed_text.get("locations", []),
                "book_numbers": book_numbers,
                "page_numbers": page_numbers
            }
            # Step 5: Choose analysis method
            if analysis_method == 'chatgpt':
                analysis_result = racist_chatgpt_analysis(processed_text['original_text'])
                return jsonify({
                    'status': 'success',
                    'ocr_engine': 'google',
                    'analysis_method': 'chatgpt',
                    'original_text': google_text,
                    'spellchecked_text': spellchecked_text,
                    'processed_text': processed_text,
                    'extracted_info': extracted_info,
                    'result': analysis_result
                }), 200
            elif analysis_method == 'logistic_regression':
                # NOTE(review): predict() is handed the whole preprocessor
                # output, not just the text — verify that is its expected input.
                lr_result = predict(processed_text, vectorizer, logistic_model)['is_racist']
                return jsonify({
                    'status': 'success',
                    'ocr_engine': 'google',
                    'analysis_method': 'logistic_regression',
                    'original_text': google_text,
                    'spellchecked_text': spellchecked_text,
                    'processed_text': processed_text,
                    'extracted_info': extracted_info,
                    'result': lr_result
                }), 200
            else:
                return jsonify({'error': 'Unsupported analysis method selected'}), 400
        elif ocr_engine == 'azure':
            # Azure OCR path is a placeholder — returns canned text.
            return jsonify({'status': 'success', 'ocr_engine': 'azure', 'text': "fill"}), 200
        else:
            return jsonify({'error': 'Unsupported OCR engine selected'}), 400
    except Exception as e:
        # Top-level request boundary: surface the failure to the client as JSON.
        return jsonify({'error': str(e)}), 500
def download_excel():
    """Build an .xlsx workbook from the posted JSON payload and return it as
    a download named ``analysis_results.xlsx``.

    Expects a JSON body that pandas can turn into a DataFrame (e.g. a list of
    row dicts or a dict of columns). Returns 400 when no body is supplied and
    500 on any conversion/serialization error.

    NOTE(review): no ``@app.route`` decorator is visible on this handler —
    confirm it is registered somewhere.
    """
    try:
        data = request.get_json()
        if not data:
            return jsonify({'error': 'No data provided'}), 400
        df = pd.DataFrame(data)
        # Build the workbook in memory instead of writing a shared
        # 'output.xlsx' on disk: concurrent requests previously raced on the
        # same file, and a stale copy was left behind after every call.
        buffer = io.BytesIO()
        with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False, sheet_name='Sheet1')
        buffer.seek(0)
        return send_file(
            buffer,
            as_attachment=True,
            download_name='analysis_results.xlsx',
            mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        )
    except Exception as e:
        # Top-level request boundary: surface the failure to the client as JSON.
        return jsonify({'error': str(e)}), 500
def health_check():
    """Liveness probe: confirm the service is up with a static JSON payload."""
    payload = {'status': 'running', 'message': 'Flask app is up and running'}
    return jsonify(payload), 200
if __name__ == '__main__':
    # Dev-server entry point: listen on all interfaces, port 7860.
    # NOTE(review): debug=True enables the Werkzeug debugger/reloader — do not
    # run this configuration in production; use a real WSGI server instead.
    app.run(debug=True, host="0.0.0.0", port=7860)