Spaces: Sleeping
import spacy
from flask import Flask, request, jsonify
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Application singletons: the Flask app and the German spaCy pipeline.
app = Flask(__name__)
# NOTE(review): requires the small German model to be installed
# (python -m spacy download de_core_news_sm) — confirm in deployment image.
nlp = spacy.load("de_core_news_sm")
def index():
    """Render a minimal HTML form that POSTs a text field to /reverse.

    NOTE(review): no @app.route decorator is visible on this view — it may
    have been lost when this file was scraped; confirm route registration.
    """
    form_html = '''
<form action="/reverse" method="post">
Enter text: <input name="text" />
<input type="submit" />
</form>
'''
    return form_html
def reverse(text=None):
    """Return an HTML snippet showing *text* reversed, with a link to the form.

    Args:
        text: String to reverse. When None (the default, as used when invoked
            as a Flask view), the 'text' field is read from the POSTed form,
            falling back to '' if absent. The parameter makes the function
            callable — and testable — outside a request context.

    Returns:
        An HTML string with the reversed text and a 'Try again' link.
    """
    if text is None:
        text = request.form.get('text', '')
    return f"<p>Reversed: {text[::-1]}</p><a href='/'>Try again</a>"
def preprocess_text_with_nlp_llm(max_chunk_size=512, overlap=50, text=None):
    """Run spaCy NLP analysis on text and split it into LLM-sized chunks.

    Args:
        max_chunk_size: Maximum number of characters per chunk.
        overlap: Characters of overlap between consecutive chunks.
        text: Text to process. When None (the default, as used when invoked
            as a Flask view), it is read from the 'text' field of the POSTed
            form, falling back to ''.

    Returns:
        A Flask JSON response with:
            'chunks': list of text chunks from the recursive splitter;
            'preprocessed_data': tokens/lemmas (with POS, dependency, and
                stop-word flags), named entities (with offsets and label
                descriptions), noun chunks, and the raw text.
    """
    if text is None:
        text = request.form.get('text', '')
    doc = nlp(text)

    # Token-level features; punctuation tokens are dropped.
    tokens_and_lemmas = [
        {
            "token": token.text,
            "lemma": token.lemma_,
            "pos": token.pos_,
            "dep": token.dep_,  # dependency relation label
            "is_stop": token.is_stop,
        }
        for token in doc
        if not token.is_punct
    ]

    # Named entities with character offsets and human-readable label descriptions.
    entities = [
        {
            "text": ent.text,
            "label": ent.label_,
            "start_char": ent.start_char,
            "end_char": ent.end_char,
            "description": spacy.explain(ent.label_),  # explanation of entity type
        }
        for ent in doc.ents
    ]

    # Base noun phrases with their syntactic roots.
    noun_chunks = [
        {"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
        for chunk in doc.noun_chunks
    ]

    preprocessed_data = {
        "tokens_and_lemmas": tokens_and_lemmas,
        "entities": entities,
        "noun_chunks": noun_chunks,
        "text": text,
    }

    # Split while preserving page and line markers. The trailing "" separator
    # restores the splitter's character-level fallback, guaranteeing no chunk
    # exceeds max_chunk_size even for long unbroken runs of text.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size,
        chunk_overlap=overlap,
        separators=["\n[PAGE", "\n", " ", ""],
    )
    chunks = splitter.split_text(text)
    return jsonify({'chunks': chunks, 'preprocessed_data': preprocessed_data})
def preprocess_text_with_nlp_pymupdf(text=None):
    """Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction.

    Args:
        text: Text to analyze. When None (the default, as used when invoked
            as a Flask view), it is read from the 'text' field of the POSTed
            form, falling back to ''.

    Returns:
        A Flask JSON response with tokens/lemmas (POS, dependency, stop-word
        flags), named entities (offsets and label descriptions), noun chunks,
        and the raw text.
    """
    if text is None:
        text = request.form.get('text', '')
    doc = nlp(text)

    # Tokenization, lemmatization, and POS tagging; punctuation is dropped.
    tokens_and_lemmas = [
        {
            "token": token.text,
            "lemma": token.lemma_,
            "pos": token.pos_,
            "dep": token.dep_,
            "is_stop": token.is_stop,
        }
        for token in doc
        if not token.is_punct
    ]

    # Named entity recognition with character offsets.
    entities = [
        {
            "text": ent.text,
            "label": ent.label_,
            "start_char": ent.start_char,
            "end_char": ent.end_char,
            "description": spacy.explain(ent.label_),
        }
        for ent in doc.ents
    ]

    # Base noun phrases with their syntactic roots.
    noun_chunks = [
        {"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
        for chunk in doc.noun_chunks
    ]

    return jsonify({
        "tokens_and_lemmas": tokens_and_lemmas,
        "entities": entities,
        "noun_chunks": noun_chunks,
        "text": text,
    })
def recursive_character_text_splitter(max_chunk_size=512, overlap=50, text=None):
    """Split text into overlapping chunks, preserving page markers.

    Previously the chunk size and overlap were hard-coded; they are now
    keyword parameters with the same values as defaults (backward compatible),
    matching the signature style of preprocess_text_with_nlp_llm.

    Args:
        max_chunk_size: Maximum number of characters per chunk.
        overlap: Characters of overlap between consecutive chunks.
        text: Text to split. When None (the default, as used when invoked as
            a Flask view), it is read from the 'text' field of the POSTed
            form, falling back to ''.

    Returns:
        A Flask JSON response: {'chunks': [...]}.
    """
    if text is None:
        text = request.form.get('text', '')
    # The trailing "" separator restores the character-level fallback so no
    # chunk can exceed max_chunk_size even without whitespace to split on.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size,
        chunk_overlap=overlap,
        separators=["\n[PAGE", "\n", " ", ""],
    )
    chunks = splitter.split_text(text)
    return jsonify({"chunks": chunks})
if __name__ == '__main__':
    # Bind on all interfaces; 7860 is the conventional Hugging Face Spaces port.
    app.run(host='0.0.0.0', port=7860)