RedPanda-CPP/RedPandaIDE/parser/cpptokenizer.cpp

700 lines
17 KiB
C++

/*
* Copyright (C) 2020-2022 Roy Qu (royqh1979@gmail.com)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#include "cpptokenizer.h"
#include <QFile>
#include <QTextStream>
// Creates a tokenizer that performs no work of its own; tokenize() fills
// the token list.
CppTokenizer::CppTokenizer() = default;
// Drops everything left over from a previous run: the joined source text,
// the stored source lines and the collected tokens.
void CppTokenizer::reset()
{
    mBufferStr.clear();
    mBuffer.clear();
    mTokenList.clear();
}
void CppTokenizer::tokenize(const QStringList &buffer)
{
reset();
mBuffer = buffer;
if (mBuffer.isEmpty())
return;
mBufferStr = mBuffer[0];
for (int i=1;i<mBuffer.size();i++) {
mBufferStr+='\n';
mBufferStr+=mBuffer[i];
}
mStart = mBufferStr.data();
mCurrent = mStart;
mLineCount = mStart;
QString s = "";
bool bSkipBlocks = false;
mCurrentLine = 1;
while (true) {
mLastToken = s;
s = getNextToken(true, true, bSkipBlocks);
simplify(s);
if (s.isEmpty())
break;
else
addToken(s,mCurrentLine);
}
}
void CppTokenizer::dumpTokens(const QString &fileName)
{
QFile file(fileName);
if (file.open(QIODevice::WriteOnly | QIODevice::Truncate)) {
QTextStream stream(&file);
foreach (const PToken& token,mTokenList) {
stream<<QString("%1,%2").arg(token->line).arg(token->text)<<Qt::endl;
}
}
}
// Read-only access to the tokens collected so far.
const CppTokenizer::TokenList &CppTokenizer::tokens()
{
    return this->mTokenList;
}
// Returns the token at index `i`; the index is not range-checked here.
CppTokenizer::PToken CppTokenizer::operator[](int i)
{
    return mTokenList.at(i);
}
// Number of tokens currently held in the token list.
int CppTokenizer::tokenCount()
{
    return mTokenList.size();
}
void CppTokenizer::addToken(const QString &sText, int iLine)
{
PToken token = std::make_shared<Token>();
token->text = sText;
token->line = iLine;
mTokenList.append(token);
}
// Moves the line-counting cursor (mLineCount) up to the scanning cursor
// (mCurrent), bumping mCurrentLine for every '\n' passed on the way.
// Stops early at the terminating NUL.
void CppTokenizer::countLines()
{
    for (; mLineCount < mCurrent && *mLineCount != 0; ++mLineCount) {
        if (*mLineCount == '\n')
            ++mCurrentLine;
    }
}
// Reads a parenthesised argument list starting at the current '('.
//
// Returns the text of the whole "(...)" group after simplifyArgs() has
// normalised its whitespace. If the group is directly followed by '.' or
// "->", the member access chained onto it is skipped as well. Finally the
// cursor is moved to the start of the next token.
QString CppTokenizer::getArguments()
{
    QChar *argsBegin = mCurrent;
    skipPair('(', ')');
    QString args(argsBegin, mCurrent - argsBegin);
    simplifyArgs(args);

    const bool memberAccessFollows =
            (*mCurrent == '.')
            || ((*mCurrent == '-') && (*(mCurrent + 1) == '>'));
    if (memberAccessFollows) {
        // Skip the '.' / "->" and the name behind it, up to the next
        // delimiter, whitespace, line break or end of input.
        for (;;) {
            const QChar ch = *mCurrent;
            if (ch == 0
                    || ch == '('
                    || ch == ';'
                    || ch == '{'
                    || ch == '}'
                    || ch == ')'
                    || isLineChar(ch)
                    || isSpaceChar(ch))
                break;
            ++mCurrent;
        }
    }

    skipToNextToken();
    return args;
}
// Handles the parenthesised header of a `for` statement.
//
// The tokens of the init statement (up to and including the first ';', or
// ':' for a range-based for) are added to the token list directly. After
// that the cursor is rewound to the opening '(' and the whole "(...)" group
// is skipped. Always returns an empty string, so the caller does not add a
// token of its own for the header.
QString CppTokenizer::getForInit()
{
    QChar *const openParen = mCurrent;

    // Step inside the parentheses.
    ++mCurrent;

    QString token;
    do {
        token = getNextToken(true, true, false);
        simplify(token);
        if (!token.isEmpty())
            addToken(token, mCurrentLine);
        // Stop at end of input, at ';', or at the ':' of a for-each loop.
    } while (!token.isEmpty() && token != ";" && token != ":");

    // Rewind and skip the complete for header.
    mCurrent = openParen;
    skipPair('(', ')');
    return "";
}
// Scans forward from the current position and returns the next token.
//
// Parameters:
//   (first, unnamed) - unused.
//   bSkipArray       - forwarded to getWord().
//   bSkipBlock       - forwarded to getWord().
//
// Returns the raw token text, or an empty string when the end of the input
// is reached. Characters that start no token are consumed via advance().
//
// An "#include" line carrying a trailing ":<number>" resets mCurrentLine to
// <number> - 1. The line counter is only touched when the text after the last
// ':' really parses as an integer; a failed parse used to yield 0 and push the
// counter to -1.
QString CppTokenizer::getNextToken(bool /* bSkipParenthesis */, bool bSkipArray, bool bSkipBlock)
{
    QString result;
    bool done = false;
    while (true) {
        skipToNextToken();
        if (*mCurrent == 0)
            break;
        if (isPreprocessor()) {
            countLines();
            result = getPreprocessor(); // don't count preprocessor lines
            if (result.startsWith("#include")) {
                int delimPos = result.lastIndexOf(':');
                if (delimPos >= 0) {
                    bool ok = false;
                    const int line = result.midRef(delimPos+1).toInt(&ok);
                    // Ignore markers without a valid line number instead of
                    // resetting the counter to -1.
                    if (ok)
                        mCurrentLine = line - 1;
                }
            }
            done = (result != "");
        } else if (isForInit()) {
            countLines();
            // getForInit() adds its tokens itself and returns "", so the
            // loop simply continues behind the for header.
            result = getForInit();
            done = (result != "");
        } else if (isArguments()) {
            countLines();
            result = getArguments();
            done = (result != "");
        } else if (isWord()) {
            countLines();
            result = getWord(false, bSkipArray, bSkipBlock);
            done = (result != "");
        } else if (isNumber()) {
            countLines();
            result = getNumber();
            done = (result != "");
        } else {
            switch((*mCurrent).unicode()) {
            case 0:
                done = true;
                break;
            case '/':
                advance();
                break;
            case '{':
            case '}':
            case ';':
            case ',':
            case ':': // return the single punctuation character as a token
                countLines();
                result = *mCurrent;
                advance();
                done = true;
                break;
            case '>': // keep stream operators
                if (*(mCurrent + 1) == '>') {
                    countLines();
                    result = ">>";
                    advance();
                    done = true;
                } else
                    advance();
                break;
            case '<':
                if (*(mCurrent + 1) == '<') {
                    countLines();
                    result = "<<";
                    advance();
                    done = true;
                } else
                    advance();
                break;
            default:
                advance();
            }
        }
        if (done)
            break;
    }
    return result;
}
// Reads a numeric literal starting at the current digit.
//
// Consumes digits and the characters accepted by isHexChar(). If the
// character right after the literal is '.', it is appended to the returned
// text but not consumed. Returns an empty string when the cursor is not on
// a digit.
QString CppTokenizer::getNumber()
{
    QChar *numBegin = mCurrent;
    if (isDigitChar(*mCurrent)) {
        do {
            advance();
        } while (isDigitChar(*mCurrent) || isHexChar(*mCurrent));
    }

    if (numBegin == mCurrent)
        return QString();

    QString number(numBegin, mCurrent - numBegin);
    if (*mCurrent == '.') // keep '.' for decimal
        number += *mCurrent;
    return number;
}
// Returns the text from the current position up to where skipToEOL() stops,
// i.e. the preprocessor directive including any continued lines.
QString CppTokenizer::getPreprocessor()
{
    QChar *const directiveBegin = mCurrent;
    skipToEOL();
    const auto length = mCurrent - directiveBegin;
    return QString(directiveBegin, length);
}
// Reads an identifier-like word and whatever is glued onto it.
//
// Parameters:
//   bSkipParenthesis - only passed on to the recursive call.
//   bSkipArray       - when true, "[...]" groups following the word are
//                      appended to the result.
//   bSkipBlock       - when true, a "{...}" block following the word is
//                      skipped.
//
// The result consists of the word itself (for "operator" including the
// operator symbol), an optional template argument list (not kept for the
// `template` keyword), optional array brackets, and a trailing '.', "->" or
// "::". After "::" the next word is read recursively and appended.
// Returns an empty string if no word starts at the current position.
QString CppTokenizer::getWord(bool bSkipParenthesis, bool bSkipArray, bool bSkipBlock)
{
    skipToNextToken();

    // Collect the run of letter / digit characters.
    QChar *wordBegin = mCurrent;
    while (isLetterChar(*mCurrent) || isDigitChar(*mCurrent))
        ++mCurrent;

    QString word;
    if (wordBegin != mCurrent)
        word = QString(wordBegin, mCurrent - wordBegin);

    bool isTemplateKeyword = false;
    if (word == "operator" || word == "operator*" || word == "operator&") {
        // Blanks between 'operator' and the symbol are allowed.
        while (isSpaceChar(*mCurrent))
            ++mCurrent;
        // Take the operator symbol into the word.
        while (isOperatorChar(*mCurrent))
            ++mCurrent;
        word = QString(wordBegin, mCurrent - wordBegin);
    } else if (word == "template") {
        isTemplateKeyword = true;
    }

    if (word.isEmpty())
        return QString();

    QString result = word;
    skipToNextToken();

    if (*mCurrent == '<') {
        // Template arguments are kept, except after the 'template' keyword.
        QChar *templateBegin = mCurrent;
        skipTemplateArgs();
        if (!isTemplateKeyword) {
            result += QString(templateBegin, mCurrent - templateBegin);
            skipToNextToken();
        }
    } else if (bSkipArray && *mCurrent == '[') {
        // One iteration per array dimension.
        do {
            QChar *dimBegin = mCurrent;
            skipPair('[', ']');
            result += QString(dimBegin, mCurrent - dimBegin);
            simplifyArgs(result);
            skipToNextToken();
        } while (*mCurrent == '[');
    } else if (bSkipBlock && *mCurrent == '{') {
        skipPair('{', '}');
        skipToNextToken();
    }

    // Keep member-access and scope operators attached to the word.
    if (*mCurrent == '.') {
        result += *mCurrent;
        ++mCurrent;
    } else if (*mCurrent == '-' && *(mCurrent + 1) == '>') {
        result += QString(mCurrent, 2);
        mCurrent += 2;
    } else if (*mCurrent == ':' && *(mCurrent + 1) == ':') {
        result += QString(mCurrent, 2);
        mCurrent += 2;
        // The qualified name continues: append the following word.
        result += getWord(bSkipParenthesis, bSkipArray, bSkipBlock);
    }
    return result;
}
bool CppTokenizer::isArguments()
{
return *mCurrent == '(';
}
// True when the previous token was "for" and the cursor is on its '('.
bool CppTokenizer::isForInit()
{
    if (*mCurrent != '(')
        return false;
    return mLastToken == "for";
}
bool CppTokenizer::isNumber()
{
return isDigitChar(*mCurrent);
}
bool CppTokenizer::isPreprocessor()
{
return *mCurrent=='#';
}
// True when a word starts at the cursor: a letter-like character that is
// not immediately followed by a double quote (which rules out one-letter
// string prefixes such as R"...").
bool CppTokenizer::isWord()
{
    if (!isLetterChar(*mCurrent))
        return false;
    return !(*(mCurrent+1) == '"');
}
// Removes every '\n' / '\r' from `output` and trims leading and trailing
// whitespace. The argument is modified in place.
void CppTokenizer::simplify(QString &output)
{
    QString stripped;
    for (int i = 0; i < output.size(); ++i) {
        const QChar ch = output.at(i);
        if (isLineChar(ch))
            continue;
        stripped += ch;
    }
    output = stripped.trimmed();
}
// Normalises blanks (space / tab) inside an argument list, in place.
//
// Rules, applied to the trimmed input:
//   - blanks right after '(' (or at the very start) are dropped;
//   - blanks in front of ',', '(' or ')' are dropped;
//   - any other run of blanks collapses into one single space.
void CppTokenizer::simplifyArgs(QString &output)
{
    const QString trimmed = output.trimmed();
    QString compact;
    bool pendingSpace = false;    // blanks seen since the last emitted character
    bool afterOpenParen = true;   // nothing emitted yet since '(' / start
    for (const QChar &ch : trimmed) {
        if (isSpaceChar(ch)) {
            if (!afterOpenParen)
                pendingSpace = true;
            continue;
        }
        if (ch == ',' || ch == '(' || ch == ')') {
            // Punctuation swallows blanks that precede it.
            afterOpenParen = (ch == '(');
        } else {
            afterOpenParen = false;
            if (pendingSpace)
                compact += " ";
        }
        pendingSpace = false;
        compact += ch;
    }
    output = compact;
}
void CppTokenizer::skipAssignment()
{
while (true) {
switch ((*mCurrent).unicode()) {
case '(': skipPair('(', ')');
break;
case '"': skipDoubleQuotes();
break;
case '\'': skipSingleQuote();
break;
case '{': skipPair('{', '}'); // support struct initializers
break;
case '/':
mCurrent++;
break;
default:
if ((*mCurrent == 'R') && (*(mCurrent+1) == '"'))
skipRawString();
else
mCurrent++;
}
if (*mCurrent == ','
|| *mCurrent ==';'
|| *mCurrent ==')'
|| *mCurrent =='}'
|| *mCurrent ==0)
break;
}
}
// Skips a double-quoted string literal; the cursor must be on the opening
// quote.
//
// On return the cursor is just behind the closing quote, or on the
// terminating NUL if the literal is not closed. A backslash escapes the
// character after it. The escaped character is only skipped if it is not
// the NUL terminator, so an input ending in a backslash cannot move the
// cursor past the end of the buffer.
void CppTokenizer::skipDoubleQuotes()
{
    mCurrent++; // opening quote
    while (!(*mCurrent == '"' || *mCurrent == 0)) {
        if (*mCurrent == '\\') {
            mCurrent++; // the backslash itself
            if (*mCurrent == 0)
                break;  // dangling backslash at end of input
        }
        mCurrent++;     // ordinary or escaped character
    }
    if (*mCurrent != 0) {
        mCurrent++;     // closing quote
    }
}
void CppTokenizer::skipPair(const QChar &cStart, const QChar cEnd, const QSet<QChar>& failChars)
{
mCurrent++;
while (*mCurrent != 0) {
if ((*mCurrent == '(') && !failChars.contains('(')) {
skipPair('(', ')', failChars);
} else if ((*mCurrent == '[') && !failChars.contains('[')) {
skipPair('[', ']', failChars);
} else if ((*mCurrent == '{') && !failChars.contains('{')) {
skipPair('{', '}', failChars);
} else if (*mCurrent == cStart) {
skipPair(cStart, cEnd, failChars);
} else if (*mCurrent == cEnd) {
mCurrent++; // skip over end
break;
} else if ((*mCurrent == 'R') && (*(mCurrent+1) == '"')) {
if (cStart != '\'' && cStart!='\"')
skipRawString(); // don't do it inside AnsiString!
else
mCurrent++;
} else if (*mCurrent == '"') {
if (cStart != '\'' && cStart!='\"')
skipDoubleQuotes(); // don't do it inside AnsiString!
else
mCurrent++;
} else if (*mCurrent == '\'') {
if (cStart != '\'' && cStart!='\"')
skipSingleQuote(); // don't do it inside AnsiString!
else
mCurrent++;
} else if (failChars.contains(*mCurrent)) {
break;
} else {
mCurrent++;
}
}
}
void CppTokenizer::skipRawString()
{
mCurrent++; //skip R
bool noEscape = false;
while(true) {
mCurrent++;
switch(mCurrent->unicode()) {
case '(':
noEscape = true;
break;
case ')':
noEscape = false;
break;
}
if (*mCurrent == 0)
break;
if ((*mCurrent == '"') && !noEscape)
break;
}
if (*mCurrent!=0)
mCurrent++;
}
// Skips a single-quoted character literal; the cursor must be on the
// opening quote.
//
// On return the cursor is just behind the closing quote, or on the
// terminating NUL if the literal is not closed. A backslash escapes the
// character after it. The escaped character is only skipped if it is not
// the NUL terminator, so an input ending in a backslash cannot move the
// cursor past the end of the buffer.
void CppTokenizer::skipSingleQuote()
{
    mCurrent++; // opening quote
    while (!(*mCurrent == '\'' || *mCurrent == 0)) {
        if (*mCurrent == '\\') {
            mCurrent++; // the backslash itself
            if (*mCurrent == 0)
                break;  // dangling backslash at end of input
        }
        mCurrent++;     // ordinary or escaped character
    }
    if (*mCurrent != 0) {
        mCurrent++;     // closing quote
    }
}
// Skips a line continuation: the backslash under the cursor plus all
// line-break characters that follow it.
void CppTokenizer::skipSplitLine()
{
    ++mCurrent; // the '\'
    for (; isLineChar(*mCurrent); ++mCurrent) {
        // consume the line break
    }
}
void CppTokenizer::skipTemplateArgs()
{
if (*mCurrent != '<')
return;
QChar* start = mCurrent;
QSet<QChar> failSet;
failSet.insert('{');
failSet.insert('}');
failSet.insert(';');
skipPair('<', '>', failSet);
// if we failed, return to where we came from
if (start!=mCurrent && *(mCurrent - 1) != '>')
mCurrent = start;
}
// Moves the cursor behind the end of the current logical line.
//
// The line break itself is consumed. If the character in front of the line
// break is a backslash, the next physical line is treated as part of the
// same logical line. Stops on the NUL at the end of input.
void CppTokenizer::skipToEOL()
{
    for (;;) {
        while (*mCurrent != 0 && !isLineChar(*mCurrent))
            ++mCurrent;
        if (*mCurrent == 0)
            return;

        // Cursor is on a line break here; check for a continuation.
        const bool continued = (*(mCurrent - 1) == '\\');
        do {
            ++mCurrent;
        } while (isLineChar(*mCurrent));

        if (!continued || *mCurrent == 0)
            return;
    }
}
void CppTokenizer::skipToNextToken()
{
while (isSpaceChar(*mCurrent) || isLineChar(*mCurrent))
advance();
}
// Consumes one lexical unit at the cursor.
//
//   - string / character literals and raw strings are skipped as a whole;
//   - '=' and compound assignments ("+=", "/=", ...) skip the assigned
//     expression via skipAssignment();
//   - a '=' whose second-to-last token is "using" (alias declaration) is
//     instead emitted as a "=" token, so the aliased type is tokenized too;
//   - a backslash in front of a line break skips the line continuation;
//   - anything else moves the cursor by one character.
//
// The alias check requires at least two tokens. The former `size() > 2`
// test missed a `using X = ...;` whose `using` and `X` are the only tokens
// collected so far.
void CppTokenizer::advance()
{
    switch(mCurrent->unicode()) {
    case '\"':
        skipDoubleQuotes();
        break;
    case '\'':
        skipSingleQuote();
        break;
    case '/':
        if (*(mCurrent + 1) == '=')
            skipAssignment();
        else
            mCurrent++;
        break;
    case '=': {
        // Index size()-2 is valid as soon as there are two tokens.
        if (mTokenList.size() >= 2
                && mTokenList[mTokenList.size()-2]->text == "using") {
            addToken("=",mCurrentLine);
            mCurrent++;
        } else
            skipAssignment();
        break;
    }
    case '&':
    case '*':
    case '!':
    case '|':
    case '+':
    case '-':
    case '~':
        if (*(mCurrent + 1) == '=')
            skipAssignment();
        else
            mCurrent++;
        break;
    case '\\':
        if (isLineChar(*(mCurrent + 1)))
            skipSplitLine();
        else
            mCurrent++;
        break;
    default:
        if ((*mCurrent == 'R') && (*(mCurrent+1) == '"'))
            skipRawString();
        else
            mCurrent++;
    }
}
bool CppTokenizer::isLetterChar(const QChar &ch)
{
// return (ch>= 'A' && ch<='Z')
// || (ch>='a' && ch<='z')
return ch.isLetter()
|| ch == '_'
|| ch == '*'
|| ch == '&'
|| ch == '~';
}
bool CppTokenizer::isHexChar(const QChar &ch)
{
return (ch >= 'A' && ch<='F')
|| (ch>='a' && ch<='f')
|| ch == 'x'
|| ch == 'L';
}
bool CppTokenizer::isDigitChar(const QChar &ch)
{
return (ch>='0' && ch<='9');
}
bool CppTokenizer::isSpaceChar(const QChar &ch)
{
return (ch == ' ' || ch == '\t');
}
bool CppTokenizer::isLineChar(const QChar &ch)
{
return (ch=='\n' || ch=='\r');
}
bool CppTokenizer::isBlankChar(const QChar &ch)
{
return (ch<=32);
}
// True for characters that can be part of an operator symbol following the
// `operator` keyword: + - / * [ ] = % ! & | > < ^
bool CppTokenizer::isOperatorChar(const QChar &ch)
{
    static const char kOperatorChars[] = "+-/*[]=%!&|><^";
    const auto code = ch.unicode();
    for (const char *p = kOperatorChars; *p != '\0'; ++p) {
        if (code == static_cast<unsigned char>(*p))
            return true;
    }
    return false;
}
// Compares the characters in [wordStart, wordEnd) with `text`.
bool CppTokenizer::currentWordEquals(QChar *wordStart, QChar *wordEnd, const QString& text)
{
    return text == QString(wordStart, wordEnd - wordStart);
}