# app.py — Flask demo: text reversal + German spaCy NLP preprocessing endpoints.
# (Hugging Face Space by amougou-fortiss, commit bb66f7f)
import spacy
from flask import Flask, request, jsonify
from langchain.text_splitter import RecursiveCharacterTextSplitter
app = Flask(__name__)
nlp = spacy.load("de_core_news_sm")
@app.route('/')
def index():
return '''
<form action="/reverse" method="post">
Enter text: <input name="text" />
<input type="submit" />
</form>
'''
@app.route('/reverse', methods=['POST'])
def reverse():
text = request.form.get('text', '')
return f"<p>Reversed: {text[::-1]}</p><a href='/'>Try again</a>"
@app.route('/preprocess_text_with_nlp_llm', methods=['POST'])
def preprocess_text_with_nlp_llm(max_chunk_size=512, overlap=50):
text = request.form.get('text', '')
doc = nlp(text)
# Enhanced tokenization and lemmatization with POS tags
tokens_and_lemmas = [
{
"token": token.text,
"lemma": token.lemma_,
"pos": token.pos_,
"dep": token.dep_, # Dependency parsing
"is_stop": token.is_stop,
}
for token in doc
if not token.is_punct
]
# Enhanced named entity recognition with additional metadata
entities = [
{
"text": ent.text,
"label": ent.label_,
"start_char": ent.start_char,
"end_char": ent.end_char,
"description": spacy.explain(ent.label_), # Get explanation of entity type
}
for ent in doc.ents
]
# Extract key phrases and noun chunks
noun_chunks = [
{"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
for chunk in doc.noun_chunks
]
preprocessed_data = {
"tokens_and_lemmas": tokens_and_lemmas,
"entities": entities,
"noun_chunks": noun_chunks,
"text": text,
}
# Split while preserving page and line markers
splitter = RecursiveCharacterTextSplitter(
chunk_size=max_chunk_size,
chunk_overlap=overlap,
separators=["\n[PAGE", "\n", " "],
)
chunks = splitter.split_text(text)
return jsonify({'chunks': chunks, 'preprocessed_data': preprocessed_data})
@app.route('/preprocess_text_with_nlp_pymupdf', methods=['POST'])
def preprocess_text_with_nlp_pymupdf():
"""Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction"""
text = request.form.get('text', '')
doc = nlp(text)
# Tokenization, lemmatization, and POS tagging
tokens_and_lemmas = [
{
"token": token.text,
"lemma": token.lemma_,
"pos": token.pos_,
"dep": token.dep_,
"is_stop": token.is_stop,
}
for token in doc
if not token.is_punct
]
# Named entity recognition
entities = [
{
"text": ent.text,
"label": ent.label_,
"start_char": ent.start_char,
"end_char": ent.end_char,
"description": spacy.explain(ent.label_),
}
for ent in doc.ents
]
# Noun chunks
noun_chunks = [
{"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
for chunk in doc.noun_chunks
]
return jsonify({
"tokens_and_lemmas": tokens_and_lemmas,
"entities": entities,
"noun_chunks": noun_chunks,
"text": text,
})
@app.route('/recursive_character_text_splitter', methods=['POST'])
def recursive_character_text_splitter():
text = request.form.get('text', '')
splitter = RecursiveCharacterTextSplitter(
chunk_size=512, chunk_overlap=50, separators=["\n[PAGE", "\n", " "]
)
chunks = splitter.split_text(text)
return jsonify({"chunks": chunks})
if __name__ == '__main__':
app.run(host='0.0.0.0', port=7860)