"""Small Flask service exposing German NLP preprocessing (spaCy) and
LLM-oriented text chunking (LangChain), plus a demo string-reversal form."""

import spacy
from flask import Flask, request, jsonify
from langchain.text_splitter import RecursiveCharacterTextSplitter

app = Flask(__name__)

# German small model; requires `python -m spacy download de_core_news_sm`.
nlp = spacy.load("de_core_news_sm")

# Chunking defaults shared by the LLM-preprocessing and splitter endpoints.
DEFAULT_CHUNK_SIZE = 512
DEFAULT_CHUNK_OVERLAP = 50
# Separators ordered so page markers (e.g. "\n[PAGE ...") are preferred split
# points, keeping them at chunk boundaries.
SPLIT_SEPARATORS = ["\n[PAGE", "\n", " "]


def _analyze_text(text):
    """Run the spaCy pipeline on *text* and return its linguistic metadata.

    Returns a dict with:
      tokens_and_lemmas: per-token text, lemma, POS tag, dependency label and
                         stop-word flag; punctuation tokens are skipped.
      entities:          named entities with char offsets and a human-readable
                         explanation of the entity label.
      noun_chunks:       noun phrases with their syntactic root info.
      text:              the original input, echoed back.
    """
    doc = nlp(text)

    # Tokenization and lemmatization with POS tags and dependency parsing.
    tokens_and_lemmas = [
        {
            "token": token.text,
            "lemma": token.lemma_,
            "pos": token.pos_,
            "dep": token.dep_,
            "is_stop": token.is_stop,
        }
        for token in doc
        if not token.is_punct
    ]

    # Named entity recognition with additional metadata.
    entities = [
        {
            "text": ent.text,
            "label": ent.label_,
            "start_char": ent.start_char,
            "end_char": ent.end_char,
            # Human-readable explanation of the entity type (may be None).
            "description": spacy.explain(ent.label_),
        }
        for ent in doc.ents
    ]

    # Key phrases / noun chunks.
    noun_chunks = [
        {"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
        for chunk in doc.noun_chunks
    ]

    return {
        "tokens_and_lemmas": tokens_and_lemmas,
        "entities": entities,
        "noun_chunks": noun_chunks,
        "text": text,
    }


def _split_text(text, chunk_size=DEFAULT_CHUNK_SIZE, overlap=DEFAULT_CHUNK_OVERLAP):
    """Split *text* into overlapping chunks while preserving page/line markers."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
        separators=SPLIT_SEPARATORS,
    )
    return splitter.split_text(text)


@app.route('/')
def index():
    """Serve a minimal HTML form posting to /reverse.

    NOTE(review): the original inline HTML was garbled in this file; this is a
    reconstructed minimal form preserving the surviving label text
    ("Enter text:") — confirm against the intended template.
    """
    return '''
    <form method="post" action="/reverse">
        <label>Enter text:</label>
        <input type="text" name="text">
        <input type="submit" value="Submit">
    </form>
    '''


@app.route('/reverse', methods=['POST'])
def reverse():
    """Reverse the submitted text and show it with a link back to the form.

    Bug fixed: the original return value was a single-quoted f-string spanning
    multiple lines — a syntax error. Rewritten as a triple-quoted f-string;
    the lost markup is minimally reconstructed around the surviving text
    ("Reversed: ...", "Try again").
    """
    text = request.form.get('text', '')
    return f'''
    <p>Reversed: {text[::-1]}</p>
    <a href="/">Try again</a>
    '''


@app.route('/preprocess_text_with_nlp_llm', methods=['POST'])
def preprocess_text_with_nlp_llm(max_chunk_size=512, overlap=50):
    """NLP-preprocess the posted text and split it into LLM-sized chunks.

    Flask never supplies the keyword arguments; the defaults exist so the
    function can also be called directly with different chunking parameters.
    Returns JSON: {"chunks": [...], "preprocessed_data": {...}}.
    """
    text = request.form.get('text', '')
    preprocessed_data = _analyze_text(text)
    # Split while preserving page and line markers.
    chunks = _split_text(text, chunk_size=max_chunk_size, overlap=overlap)
    return jsonify({'chunks': chunks, 'preprocessed_data': preprocessed_data})


@app.route('/preprocess_text_with_nlp_pymupdf', methods=['POST'])
def preprocess_text_with_nlp_pymupdf():
    """Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction"""
    text = request.form.get('text', '')
    # Same analysis as /preprocess_text_with_nlp_llm, without chunking.
    return jsonify(_analyze_text(text))


@app.route('/recursive_character_text_splitter', methods=['POST'])
def recursive_character_text_splitter():
    """Chunk the posted text without any NLP analysis."""
    text = request.form.get('text', '')
    return jsonify({"chunks": _split_text(text)})


if __name__ == '__main__':
    # Bind on all interfaces; 7860 is the conventional Hugging Face Spaces port.
    app.run(host='0.0.0.0', port=7860)