import spacy
from flask import Flask, request, jsonify
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Flask application instance; all route decorators below register against it.
app = Flask(__name__)
# German small spaCy pipeline, loaded once at import time so every request reuses it.
nlp = spacy.load("de_core_news_sm")
@app.route('/')
def index():
    """Serve a minimal HTML form that POSTs text to /reverse."""
    # Inline HTML keeps this demo self-contained (no template files needed).
    page = '''
<form action="/reverse" method="post">
Enter text: <input name="text" />
<input type="submit" />
</form>
'''
    return page
@app.route('/reverse', methods=['POST'])
def reverse():
    """Reverse the submitted 'text' form field and render it as HTML."""
    submitted = request.form.get('text', '')
    flipped = submitted[::-1]
    return f"<p>Reversed: {flipped}</p><a href='/'>Try again</a>"
@app.route('/preprocess_text_with_nlp_llm', methods=['POST'])
def preprocess_text_with_nlp_llm(max_chunk_size=512, overlap=50):
    """Run spaCy NLP preprocessing on posted text and split it into chunks.

    Form fields:
        text: input text to analyze (defaults to '').
        max_chunk_size: optional int overriding the chunk size (default 512).
        overlap: optional int overriding the chunk overlap (default 50).

    Returns:
        JSON with 'chunks' (list of text chunks) and 'preprocessed_data'
        (tokens/lemmas, entities, noun chunks, and the raw text).
    """
    text = request.form.get('text', '')

    # Fix: the keyword parameters were previously unreachable — Flask never
    # passes arguments to view functions, so 512/50 were always used. Allow
    # clients to override them via form fields; werkzeug's typed get() falls
    # back to the default when the field is missing or not an int.
    max_chunk_size = request.form.get('max_chunk_size', default=max_chunk_size, type=int)
    overlap = request.form.get('overlap', default=overlap, type=int)

    doc = nlp(text)

    # Token-level features with POS tags and dependency labels; punctuation excluded.
    tokens_and_lemmas = [
        {
            "token": token.text,
            "lemma": token.lemma_,
            "pos": token.pos_,
            "dep": token.dep_,  # dependency relation to the token's head
            "is_stop": token.is_stop,
        }
        for token in doc
        if not token.is_punct
    ]

    # Named entities with character offsets and a human-readable label description.
    entities = [
        {
            "text": ent.text,
            "label": ent.label_,
            "start_char": ent.start_char,
            "end_char": ent.end_char,
            "description": spacy.explain(ent.label_),  # e.g. 'ORG' -> explanation string
        }
        for ent in doc.ents
    ]

    # Noun chunks with their syntactic root token.
    noun_chunks = [
        {"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
        for chunk in doc.noun_chunks
    ]

    preprocessed_data = {
        "tokens_and_lemmas": tokens_and_lemmas,
        "entities": entities,
        "noun_chunks": noun_chunks,
        "text": text,
    }

    # Split while preferring breaks at page markers, then newlines, then spaces.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size,
        chunk_overlap=overlap,
        separators=["\n[PAGE", "\n", " "],
    )
    chunks = splitter.split_text(text)

    return jsonify({'chunks': chunks, 'preprocessed_data': preprocessed_data})
@app.route('/preprocess_text_with_nlp_pymupdf', methods=['POST'])
def preprocess_text_with_nlp_pymupdf():
    """Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction"""
    text = request.form.get('text', '')
    analysis = nlp(text)

    # Token-level features (punctuation is skipped).
    token_records = []
    for tok in analysis:
        if tok.is_punct:
            continue
        token_records.append({
            "token": tok.text,
            "lemma": tok.lemma_,
            "pos": tok.pos_,
            "dep": tok.dep_,
            "is_stop": tok.is_stop,
        })

    # Named entities with character offsets and a human-readable label description.
    entity_records = []
    for span in analysis.ents:
        entity_records.append({
            "text": span.text,
            "label": span.label_,
            "start_char": span.start_char,
            "end_char": span.end_char,
            "description": spacy.explain(span.label_),
        })

    # Noun chunks with their syntactic root token.
    chunk_records = []
    for np_chunk in analysis.noun_chunks:
        chunk_records.append({
            "text": np_chunk.text,
            "root_text": np_chunk.root.text,
            "root_dep": np_chunk.root.dep_,
        })

    return jsonify({
        "tokens_and_lemmas": token_records,
        "entities": entity_records,
        "noun_chunks": chunk_records,
        "text": text,
    })
@app.route('/recursive_character_text_splitter', methods=['POST'])
def recursive_character_text_splitter():
    """Split posted text into overlapping chunks, preferring page-marker boundaries.

    Form fields:
        text: input text to split (defaults to '').
        chunk_size: optional int overriding the chunk size (default 512).
        chunk_overlap: optional int overriding the overlap (default 50).

    Returns:
        JSON with 'chunks': the list of text chunks.
    """
    text = request.form.get('text', '')

    # Generalized: chunk parameters were hard-coded; accept optional overrides
    # from the form, falling back to the original 512/50 defaults (werkzeug's
    # typed get() returns the default when the field is missing or non-numeric).
    chunk_size = request.form.get('chunk_size', default=512, type=int)
    chunk_overlap = request.form.get('chunk_overlap', default=50, type=int)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Prefer breaking at page markers, then newlines, then spaces.
        separators=["\n[PAGE", "\n", " "],
    )
    chunks = splitter.split_text(text)
    return jsonify({"chunks": chunks})
# Entry point: run the development server. Binding 0.0.0.0 on port 7860
# matches the Hugging Face Spaces hosting convention.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)