amougou-fortiss committed on
Commit
44c4e3b
·
verified ·
1 Parent(s): ee45916

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -0
app.py CHANGED
@@ -1,6 +1,10 @@
 
 
1
  from flask import Flask, request, jsonify
 
2
 
3
  app = Flask(__name__)
 
4
 
5
  @app.route('/')
6
  def index():
@@ -16,5 +20,97 @@ def reverse():
16
  text = request.form.get('text', '')
17
  return f"<p>Reversed: {text[::-1]}</p><a href='/'>Try again</a>"
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  if __name__ == '__main__':
20
  app.run(host='0.0.0.0', port=7860)
 
1
+ import spacy
2
+
3
  from flask import Flask, request, jsonify
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
 
6
  app = Flask(__name__)
7
+ nlp = spacy.load("de_core_news_sm")
8
 
9
  @app.route('/')
10
  def index():
 
20
  text = request.form.get('text', '')
21
  return f"<p>Reversed: {text[::-1]}</p><a href='/'>Try again</a>"
22
 
23
+ @app.route('/preprocess_text_with_nlp_llm', methods=['POST'])
24
+ def preprocess_text_with_nlp_llm():
25
+ text = request.form.get('text', '')
26
+ doc = nlp(text)
27
+ # Enhanced tokenization and lemmatization with POS tags
28
+ tokens_and_lemmas = [
29
+ {
30
+ "token": token.text,
31
+ "lemma": token.lemma_,
32
+ "pos": token.pos_,
33
+ "dep": token.dep_, # Dependency parsing
34
+ "is_stop": token.is_stop,
35
+ }
36
+ for token in doc
37
+ if not token.is_punct
38
+ ]
39
+ # Enhanced named entity recognition with additional metadata
40
+ entities = [
41
+ {
42
+ "text": ent.text,
43
+ "label": ent.label_,
44
+ "start_char": ent.start_char,
45
+ "end_char": ent.end_char,
46
+ "description": spacy.explain(ent.label_), # Get explanation of entity type
47
+ }
48
+ for ent in doc.ents
49
+ ]
50
+ # Extract key phrases and noun chunks
51
+ noun_chunks = [
52
+ {"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
53
+ for chunk in doc.noun_chunks
54
+ ]
55
+ preprocessed_data = {
56
+ "tokens_and_lemmas": tokens_and_lemmas,
57
+ "entities": entities,
58
+ "noun_chunks": noun_chunks,
59
+ "text": text,
60
+ }
61
+ # Split while preserving page and line markers
62
+ splitter = RecursiveCharacterTextSplitter(
63
+ chunk_size=max_chunk_size,
64
+ chunk_overlap=overlap,
65
+ separators=["\n[PAGE", "\n", " "],
66
+ )
67
+ chunks = splitter.split_text(text)
68
+ return jsonify({'chunks': chunks, 'preprocessed_data': preprocessed_data)
69
+
70
+ @app.route('/preprocess_text_with_nlp_pymupdf', methods=['POST'])
71
+ def preprocess_text_with_nlp_pymupdf():
72
+ """Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction"""
73
+ text = request.form.get('text', '')
74
+ doc = nlp(text)
75
+
76
+ # Tokenization, lemmatization, and POS tagging
77
+ tokens_and_lemmas = [
78
+ {
79
+ "token": token.text,
80
+ "lemma": token.lemma_,
81
+ "pos": token.pos_,
82
+ "dep": token.dep_,
83
+ "is_stop": token.is_stop,
84
+ }
85
+ for token in doc
86
+ if not token.is_punct
87
+ ]
88
+
89
+ # Named entity recognition
90
+ entities = [
91
+ {
92
+ "text": ent.text,
93
+ "label": ent.label_,
94
+ "start_char": ent.start_char,
95
+ "end_char": ent.end_char,
96
+ "description": spacy.explain(ent.label_),
97
+ }
98
+ for ent in doc.ents
99
+ ]
100
+
101
+ # Noun chunks
102
+ noun_chunks = [
103
+ {"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
104
+ for chunk in doc.noun_chunks
105
+ ]
106
+
107
+ return {
108
+ "tokens_and_lemmas": tokens_and_lemmas,
109
+ "entities": entities,
110
+ "noun_chunks": noun_chunks,
111
+ "text": text,
112
+ }
113
+
114
+
115
# Script entry point: start Flask's built-in server, bound to all interfaces
# on port 7860 (the conventional Hugging Face Spaces port).
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)