File size: 3,805 Bytes
44c4e3b
 
be882a8
44c4e3b
be882a8
 
44c4e3b
be882a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44c4e3b
bb66f7f
44c4e3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b7166
44c4e3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78b7166
44c4e3b
 
 
 
78b7166
 
 
 
 
 
 
 
 
 
44c4e3b
be882a8
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import spacy

from flask import Flask, request, jsonify
from langchain.text_splitter import RecursiveCharacterTextSplitter

app = Flask(__name__)
# German small pipeline; loaded once at import time so every request reuses it.
# NOTE(review): spacy.load raises OSError if the model package is not installed
# (python -m spacy download de_core_news_sm) — the app will fail at startup then.
nlp = spacy.load("de_core_news_sm")

@app.route('/')
def index():
    """Serve a minimal HTML form that POSTs its text to /reverse."""
    form_html = '''
        <form action="/reverse" method="post">
            Enter text: <input name="text" />
            <input type="submit" />
        </form>
    '''
    return form_html

@app.route('/reverse', methods=['POST'])
def reverse():
    """Reverse the submitted 'text' form field and render it with a retry link."""
    submitted = request.form.get('text', '')
    reversed_text = submitted[::-1]
    return f"<p>Reversed: {reversed_text}</p><a href='/'>Try again</a>"

@app.route('/preprocess_text_with_nlp_llm', methods=['POST'])
def preprocess_text_with_nlp_llm(max_chunk_size=512, overlap=50):
    """Run spaCy NLP preprocessing and LLM-oriented chunking on posted text.

    Form fields:
        text: the input text (defaults to '').
        max_chunk_size: optional integer override for the splitter chunk size.
        overlap: optional integer override for the splitter chunk overlap.

    Returns:
        JSON with 'chunks' (list of text chunks) and 'preprocessed_data'
        (tokens/lemmas with POS and dependency tags, named entities,
        noun chunks, and the original text), or a 400 error if the
        override fields are not integers.
    """
    text = request.form.get('text', '')

    # Flask never passes arguments to route handlers, so the keyword defaults
    # above were previously impossible to override. Honor optional form fields
    # instead, falling back to the same defaults (backward compatible).
    try:
        max_chunk_size = int(request.form.get('max_chunk_size', max_chunk_size))
        overlap = int(request.form.get('overlap', overlap))
    except (TypeError, ValueError):
        return jsonify({'error': 'max_chunk_size and overlap must be integers'}), 400

    doc = nlp(text)

    # Tokenization and lemmatization with POS and dependency tags;
    # punctuation tokens are excluded.
    tokens_and_lemmas = [
        {
            "token": token.text,
            "lemma": token.lemma_,
            "pos": token.pos_,
            "dep": token.dep_,  # Dependency parsing
            "is_stop": token.is_stop,
        }
        for token in doc
        if not token.is_punct
    ]

    # Named entity recognition with character offsets and a human-readable
    # explanation of each label (spacy.explain may return None for unknown labels).
    entities = [
        {
            "text": ent.text,
            "label": ent.label_,
            "start_char": ent.start_char,
            "end_char": ent.end_char,
            "description": spacy.explain(ent.label_),
        }
        for ent in doc.ents
    ]

    # Key phrases: noun chunks with their syntactic root.
    noun_chunks = [
        {"text": chunk.text, "root_text": chunk.root.text, "root_dep": chunk.root.dep_}
        for chunk in doc.noun_chunks
    ]

    preprocessed_data = {
        "tokens_and_lemmas": tokens_and_lemmas,
        "entities": entities,
        "noun_chunks": noun_chunks,
        "text": text,
    }

    # Split while preserving page and line markers: "\n[PAGE" is tried first so
    # chunks break preferentially at page boundaries.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size,
        chunk_overlap=overlap,
        separators=["\n[PAGE", "\n", " "],
    )
    chunks = splitter.split_text(text)
    return jsonify({'chunks': chunks, 'preprocessed_data': preprocessed_data})

@app.route('/preprocess_text_with_nlp_pymupdf', methods=['POST'])
def preprocess_text_with_nlp_pymupdf():
    """Enhanced NLP preprocessing identical to your first experiment using PyMuPDF text extraction"""
    text = request.form.get('text', '')
    doc = nlp(text)

    # Tokenization, lemmatization, and POS tagging (punctuation skipped).
    tokens_and_lemmas = []
    for token in doc:
        if token.is_punct:
            continue
        tokens_and_lemmas.append({
            "token": token.text,
            "lemma": token.lemma_,
            "pos": token.pos_,
            "dep": token.dep_,
            "is_stop": token.is_stop,
        })

    # Named entity recognition with character offsets and label explanations.
    entities = []
    for ent in doc.ents:
        entities.append({
            "text": ent.text,
            "label": ent.label_,
            "start_char": ent.start_char,
            "end_char": ent.end_char,
            "description": spacy.explain(ent.label_),
        })

    # Noun chunks with their syntactic roots.
    noun_chunks = []
    for chunk in doc.noun_chunks:
        noun_chunks.append({
            "text": chunk.text,
            "root_text": chunk.root.text,
            "root_dep": chunk.root.dep_,
        })

    payload = {
        "tokens_and_lemmas": tokens_and_lemmas,
        "entities": entities,
        "noun_chunks": noun_chunks,
        "text": text,
    }
    return jsonify(payload)

@app.route('/recursive_character_text_splitter', methods=['POST'])
def recursive_character_text_splitter():
    """Chunk the posted 'text' field and return the chunks as JSON."""
    text = request.form.get('text', '')
    # Prefer breaking at page markers, then newlines, then spaces.
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=50,
        separators=["\n[PAGE", "\n", " "],
    )
    pieces = chunker.split_text(text)
    return jsonify({"chunks": pieces})

if __name__ == '__main__':
    # Bind on all interfaces (0.0.0.0) at port 7860, the conventional
    # Hugging Face Spaces port. NOTE(review): this is Flask's development
    # server; use a production WSGI server (e.g. gunicorn) for deployment.
    app.run(host='0.0.0.0', port=7860)