File size: 3,805 Bytes
44c4e3b
 
be882a8
44c4e3b
be882a8
 
44c4e3b
be882a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44c4e3b
bb66f7f
44c4e3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b7166
44c4e3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b7166
44c4e3b
 
 
 
78b7166
 
 
 
 
 
 
 
 
 
44c4e3b
be882a8
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import spacy

from flask import Flask, request, jsonify
from langchain.text_splitter import RecursiveCharacterTextSplitter

app = Flask(__name__)
# German small pipeline; loaded once at import time so every request reuses it.
# NOTE(review): spacy.load raises OSError if the model package is not installed
# (python -m spacy download de_core_news_sm) — the app will fail at startup then.
nlp = spacy.load("de_core_news_sm")

@app.route('/')
def index():
    """Serve a minimal HTML form that POSTs its text to /reverse."""
    form_html = '''
        <form action="/reverse" method="post">
            Enter text: <input name="text" />
            <input type="submit" />
        </form>
    '''
    return form_html

@app.route('/reverse', methods=['POST'])
def reverse():
    """Reverse the submitted 'text' form field and render it with a retry link."""
    submitted = request.form.get('text', '')
    reversed_text = submitted[::-1]
    return f"<p>Reversed: {reversed_text}</p><a href='/'>Try again</a>"

@app.route('/preprocess_text_with_nlp_llm', methods=['POST'])
def preprocess_text_with_nlp_llm(max_chunk_size=512, overlap=50):
    """Run spaCy NLP preprocessing and LLM-oriented chunking on posted text.

    Form fields:
        text: the input text (defaults to '').
        max_chunk_size: optional integer override for the splitter chunk size.
        overlap: optional integer override for the splitter chunk overlap.

    Returns:
        JSON with 'chunks' (list of text chunks) and 'preprocessed_data'
        (tokens/lemmas with POS and dependency tags, named entities,
        noun chunks, and the original text), or a 400 error if the
        override fields are not integers.
    """
    text = request.form.get('text', '')

    # Flask never passes arguments to route handlers, so the keyword defaults
    # above were previously impossible to override. Honor optional form fields
    # instead, falling back to the same defaults (backward compatible).
    try:
        max_chunk_size = int(request.form.get('max_chunk_size', max_chunk_size))
        overlap = int(request.form.get('overlap', overlap))
    except (TypeError, ValueError):
        return jsonify({'error': 'max_chunk_size and overlap must be integers'}), 400

    doc = nlp(text)

    # Tokenization and lemmatization with POS and dependency tags;
    # punctuation tokens are excluded.
    tokens_and_lemmas = [
        {
            "token": token.text,
            "lemma": token.lemma_,
            "pos": token.pos_,
            "dep": token.dep_,  # Dependency parsing
            "is_stop": token.is_stop,
        }
        for token in doc
        if not token.is_punct
    ]

    # Named entity recognition with character offsets and a human-readable
    # explanation of each label (spacy.explain may return None for unknown labels).
    entities = [
        {
            "text": ent.text,
            "label": ent.label_,
            "start_char": ent.start_char,
            "end_char": ent.end_char,
            "description": spacy.explain(ent.label_),
        }
        for ent in doc.ents
    ]

    # Key phrases: noun chunks with their syntactic root.
    noun_chunks = [
        {"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
        for chunk in doc.noun_chunks
    ]

    preprocessed_data = {
        "tokens_and_lemmas": tokens_and_lemmas,
        "entities": entities,
        "noun_chunks": noun_chunks,
        "text": text,
    }

    # Split while preserving page and line markers: "\n[PAGE" is tried first so
    # chunks break preferentially at page boundaries.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size,
        chunk_overlap=overlap,
        separators=["\n[PAGE", "\n", " "],
    )
    chunks = splitter.split_text(text)
    return jsonify({'chunks': chunks, 'preprocessed_data': preprocessed_data})

@app.route('/preprocess_text_with_nlp_pymupdf', methods=['POST'])
def preprocess_text_with_nlp_pymupdf():
    """Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction"""
    text = request.form.get('text', '')
    doc = nlp(text)

    # Tokenization, lemmatization, and POS tagging (punctuation skipped).
    tokens_and_lemmas = []
    for token in doc:
        if token.is_punct:
            continue
        tokens_and_lemmas.append({
            "token": token.text,
            "lemma": token.lemma_,
            "pos": token.pos_,
            "dep": token.dep_,
            "is_stop": token.is_stop,
        })

    # Named entity recognition with character offsets and label explanations.
    entities = []
    for ent in doc.ents:
        entities.append({
            "text": ent.text,
            "label": ent.label_,
            "start_char": ent.start_char,
            "end_char": ent.end_char,
            "description": spacy.explain(ent.label_),
        })

    # Noun chunks with their syntactic roots.
    noun_chunks = []
    for chunk in doc.noun_chunks:
        noun_chunks.append({
            "text": chunk.text,
            "root_text": chunk.root.text,
            "root_dep": chunk.root.dep_,
        })

    payload = {
        "tokens_and_lemmas": tokens_and_lemmas,
        "entities": entities,
        "noun_chunks": noun_chunks,
        "text": text,
    }
    return jsonify(payload)

@app.route('/recursive_character_text_splitter', methods=['POST'])
def recursive_character_text_splitter():
    """Chunk the posted 'text' field and return the chunks as JSON."""
    text = request.form.get('text', '')
    # Prefer breaking at page markers, then newlines, then spaces.
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=50,
        separators=["\n[PAGE", "\n", " "],
    )
    pieces = chunker.split_text(text)
    return jsonify({"chunks": pieces})

if __name__ == '__main__':
    # Bind on all interfaces (0.0.0.0) at port 7860, the conventional
    # Hugging Face Spaces port. NOTE(review): this is Flask's development
    # server; use a production WSGI server (e.g. gunicorn) for deployment.
    app.run(host='0.0.0.0', port=7860)