|
|
#pragma once |
|
|
|
|
|
#include "Vocabulary.h" |
|
|
|
|
|
class SuffixArray |
|
|
{ |
|
|
public: |
|
|
typedef unsigned int INDEX; |
|
|
|
|
|
private: |
|
|
WORD_ID *m_array; |
|
|
INDEX *m_index; |
|
|
INDEX *m_buffer; |
|
|
char *m_wordInSentence; |
|
|
INDEX *m_sentence; |
|
|
char *m_sentenceLength; |
|
|
WORD_ID m_endOfSentence; |
|
|
INDEX *m_document; |
|
|
INDEX *m_documentName; |
|
|
char *m_documentNameBuffer; |
|
|
size_t m_documentNameLength; |
|
|
size_t m_documentCount; |
|
|
bool m_useDocument; |
|
|
Vocabulary m_vcb; |
|
|
INDEX m_size; |
|
|
INDEX m_sentenceCount; |
|
|
|
|
|
|
|
|
SuffixArray(const SuffixArray&); |
|
|
void operator=(const SuffixArray&); |
|
|
|
|
|
public: |
|
|
SuffixArray(); |
|
|
~SuffixArray(); |
|
|
|
|
|
void Create(const std::string& fileName ); |
|
|
bool ProcessDocumentLine( const char* const, const size_t ); |
|
|
void Sort(INDEX start, INDEX end); |
|
|
int CompareIndex( INDEX a, INDEX b ) const; |
|
|
inline int CompareWord( WORD_ID a, WORD_ID b ) const; |
|
|
int Count( const std::vector< WORD > &phrase ); |
|
|
bool MinCount( const std::vector< WORD > &phrase, INDEX min ); |
|
|
bool Exists( const std::vector< WORD > &phrase ); |
|
|
int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 ); |
|
|
int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 ); |
|
|
INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end ); |
|
|
INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction ); |
|
|
int Match( const std::vector< WORD > &phrase, INDEX index ); |
|
|
void List( INDEX start, INDEX end ); |
|
|
void PrintSentenceMatches( const std::vector< WORD > &phrase ); |
|
|
inline INDEX GetPosition( INDEX index ) const { |
|
|
return m_index[ index ]; |
|
|
} |
|
|
inline INDEX GetSentence( INDEX position ) const { |
|
|
return m_sentence[position]; |
|
|
} |
|
|
inline char GetWordInSentence( INDEX position ) const { |
|
|
return m_wordInSentence[position]; |
|
|
} |
|
|
inline char GetSentenceLength( INDEX sentenceId ) const { |
|
|
return m_sentenceLength[sentenceId]; |
|
|
} |
|
|
inline INDEX GetSize() const { |
|
|
return m_size; |
|
|
} |
|
|
inline WORD GetWord( INDEX position ) const { |
|
|
return m_vcb.GetWord( m_array[position] ); |
|
|
} |
|
|
void UseDocument() { |
|
|
m_useDocument = true; |
|
|
} |
|
|
INDEX GetDocument( INDEX sentence ) const; |
|
|
void PrintDocumentName( INDEX document ) { |
|
|
for(INDEX i=m_documentName[ document ]; m_documentNameBuffer[i] != 0; i++) { |
|
|
std::cout << m_documentNameBuffer[ i ]; |
|
|
} |
|
|
} |
|
|
void Save(const std::string& fileName ) const; |
|
|
void Load(const std::string& fileName ); |
|
|
void CheckAllocation(bool, const char *dataStructure) const; |
|
|
bool Error( const char* message, const std::string& fileName) const; |
|
|
}; |
|
|
|