/*
 * Copyright (C) 2020-2022 Roy Qu (royqh1979@gmail.com)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */
#include "cpptokenizer.h"
// NOTE(review): the header names of the next three includes are missing from
// this copy of the file (it looks as if everything between '<' and '>' was
// stripped). Restore them from version control before building.
#include
#include
#include

// Default constructor; performs no work of its own.
CppTokenizer::CppTokenizer()
{

}

// Resets the tokenizer: empties the token list, both source buffers, the
// text of the last token, the three unmatched-bracket stacks and the list of
// recorded lambda-capture token indices.
void CppTokenizer::clear()
{
    mTokenList.clear();
    mBuffer.clear();
    mBufferStr.clear();
    mLastToken.clear();
    mUnmatchedBraces.clear();
    mUnmatchedBrackets.clear();
    mUnmatchedParenthesis.clear();
    mLambdas.clear();
}

// Starts tokenizing `buffer` (a list of source lines): clears previous state,
// keeps a copy of the lines, returns immediately when there are none and
// seeds mBufferStr with the first line.
//
// NOTE(review): the text from the middle of the `for` header below up to the
// `token->line` expression is missing from this copy of the file (same
// '<' ... '>' stripping as the includes). The rest of this function and the
// beginning of whatever followed it cannot be recovered from here; restore
// them from version control. The damaged tokens are kept exactly as found.
void CppTokenizer::tokenize(const QStringList &buffer)
{
    clear();
    mBuffer = buffer;
    if (mBuffer.isEmpty())
        return;
    mBufferStr = mBuffer[0];
    for (int i=1;iline).arg(token->text).arg(token->matchIndex)
#if QT_VERSION >= QT_VERSION_CHECK(5,15,0)
    <();
    // NOTE(review): a second stripped stretch ends just above. What follows is
    // the body of a routine that appends one token to mTokenList; its header
    // is lost. It reads `sText`, `iLine` and `tokenType`, and getForInit()
    // calls addToken(s, mCurrentLine, tokenType), so this is presumably
    // CppTokenizer::addToken -- confirm against the class declaration.
    //
    // The routine stores the text and line in `token`, links bracket tokens
    // to their partner through matchIndex, records lambda-capture tokens in
    // mLambdas and finally appends the token.
    token->text = sText;
    token->line = iLine;
#ifdef Q_DEBUG
    // Debug builds preset a conspicuous value; every bracket case below
    // overwrites it.
    token->matchIndex = 1000000000;
#endif
    switch(tokenType) {
    case TokenType::LeftBrace:
        // mTokenList.count() is the index this token gets once it is appended
        // at the end of this routine; remember it until the partner arrives.
        token->matchIndex=-1;
        mUnmatchedBraces.push_back(mTokenList.count());
        break;
    case TokenType::RightBrace:
        if (mUnmatchedBraces.isEmpty()) {
            // No opener pending: leave the token unmatched.
            token->matchIndex=-1;
        } else {
            // Link both ways: this token points at the most recent pending
            // opener, and that opener points at this token's future index.
            token->matchIndex = mUnmatchedBraces.last();
            mTokenList[token->matchIndex]->matchIndex=mTokenList.count();
            mUnmatchedBraces.pop_back();
        }
        break;
    case TokenType::LeftBracket:
        token->matchIndex=-1;
        mUnmatchedBrackets.push_back(mTokenList.count());
        break;
    case TokenType::RightBracket:
        if (mUnmatchedBrackets.isEmpty()) {
            token->matchIndex=-1;
        } else {
            token->matchIndex = mUnmatchedBrackets.last();
            mTokenList[token->matchIndex]->matchIndex=mTokenList.count();
            mUnmatchedBrackets.pop_back();
        }
        break;
    case TokenType::LeftParenthesis:
        token->matchIndex=-1;
        mUnmatchedParenthesis.push_back(mTokenList.count());
        break;
    case TokenType::RightParenthesis:
        if (mUnmatchedParenthesis.isEmpty()) {
            token->matchIndex=-1;
        } else {
            token->matchIndex = mUnmatchedParenthesis.last();
            mTokenList[token->matchIndex]->matchIndex=mTokenList.count();
            mUnmatchedParenthesis.pop_back();
        }
        break;
    case TokenType::LambdaCaptures:
        // Remember where the capture list sits in the token list.
        // NOTE(review): there is no `break` here; control falls through into
        // `default`, which only breaks, so behaviour is unaffected.
        mLambdas.push_back(mTokenList.count());
    default:
        break;
    }
    mTokenList.append(token);
}

// Brings the line counter up to date: walks mLineCount forward until it
// reaches mCurrent (or a 0 character) and increments mCurrentLine for every
// '\n' it passes.
void CppTokenizer::countLines()
{
    while ((*mLineCount != 0) && (mLineCount < mCurrent)) {
        if (*mLineCount == '\n')
            mCurrentLine ++;
        mLineCount++;
    }
}

// Handles the parenthesised header of a `for` statement. getNextToken() calls
// this when isForInit() holds, i.e. mCurrent is on '(' and the previous token
// was "for".
//
// Tokens are read and added to the token list until an empty token, ";" or
// ":" is seen. mCurrent is then rewound to the '(' and the whole
// parenthesised group is skipped.
//
// Always returns an empty string.
QString CppTokenizer::getForInit()
{
    QChar* startOffset = mCurrent;
    // Step into the init statement
    mCurrent++;
    TokenType tokenType;
    // Process until ; or end of file
    while (true) {
        QString s = getNextToken(&tokenType, true, false);
        simplify(s);
        if (!s.isEmpty())
            addToken(s,mCurrentLine,tokenType);
        if ( (s == "") || (s == ";") || (s==":"))
            break; // : is used in for-each loop
    }
    // Skip to end of for loop
    mCurrent = startOffset;
    skipPair('(', ')');
    return "";
}

// Scans forward from mCurrent and returns the text of the next token.
//
// pTokenType  out-parameter; set to TokenType::Normal on entry and changed to
//             LeftBrace / RightBrace / LeftParenthesis / RightParenthesis /
//             LambdaCaptures when such a token is returned.
// bSkipArray  forwarded to getWord().
// bSkipBlock  forwarded to getWord().
//
// Returns an empty string when the terminating 0 character is reached before
// any token was produced.
//
// Characters that do not start a token (most operators, string and character
// literals, comparison and compound-assignment expressions) are skipped
// inside the loop without producing anything.
QString CppTokenizer::getNextToken(TokenType *pTokenType, bool bSkipArray, bool bSkipBlock)
{
    QString result;
    // NOTE(review): backupIndex is never used in this function.
    int backupIndex;
    bool done = false;
    *pTokenType=TokenType::Normal;
    while (true) {
        skipToNextToken();
        if (*mCurrent == 0)
            break;
        if (isPreprocessor()) {
            countLines();
            result = getPreprocessor(); // don't count preprocessor lines
            if (result.startsWith("#include")) { // if we find
                // The number after the last ':' of the directive replaces the
                // current line number. Presumably the input carries
                // "#include <file>:<line>" markers from an earlier
                // preprocessing stage -- confirm with the producer.
                int delimPos = result.lastIndexOf(':');
                if (delimPos >= 0) {
                    bool ok;
                    mCurrentLine = result.midRef(delimPos+1).toInt(&ok)-1; // fCurrLine is 0 based
                }
            }
            done = (result != "");
        } else if (isForInit()) {
            countLines();
            // getForInit() adds its tokens itself and returns "", so the loop
            // simply continues after the for header.
            result = getForInit();
            done = (result != "");
//        } else if (isArguments()) {
//            countLines();
//            result = getArguments();
//            done = (result != "");
        } else if (isWord()) {
            countLines();
            result = getWord(false, bSkipArray, bSkipBlock);
//            if (result=="noexcept" || result == "throw") {
//                result="";
//                if (*mCurrent=='(')
//                    skipPair('(',')');
//            }
            done = (result != "");
        } else if (isNumber()) {
            countLines();
            result = getNumber();
            done = (result != "");
        } else {
            switch((*mCurrent).unicode()) {
            case 0:
                done = true;
                break;
            case ':':
                if (*(mCurrent + 1) == ':') {
                    countLines();
                    mCurrent+=2;
                    result = "::";
                    skipToNextToken();
                    // Append next token to this one
                    if (isIdentChar(*mCurrent))
                        result+=getWord(true, bSkipArray, bSkipBlock);
                    done = true;
                } else {
                    countLines();
                    result = *mCurrent;
                    mCurrent++;
                    done = true;
                }
                break;
            case '{':
                *pTokenType=TokenType::LeftBrace;
                countLines();
                result = *mCurrent;
                mCurrent++;
                done = true;
                break;
            case '}':
                *pTokenType=TokenType::RightBrace;
                countLines();
                result = *mCurrent;
                mCurrent++;
                done = true;
                break;
            case '(':
                *pTokenType=TokenType::LeftParenthesis;
                countLines();
                result = *mCurrent;
                mCurrent++;
                done = true;
                break;
            case '[':
                if (*(mCurrent+1)!='[') {
                    // A single '[' reached here is returned, together with
                    // everything up to its matching ']', as one
                    // LambdaCaptures token.
                    *pTokenType=TokenType::LambdaCaptures;
                    countLines();
                    QChar* backup=mCurrent;
                    skipPair('[',']');
                    result = QString(backup,mCurrent-backup);
                    done = true;
                } else {
                    skipPair('[',']'); // attribute, skipit
                }
                break;
            case ')':
                *pTokenType=TokenType::RightParenthesis;
                countLines();
                result = *mCurrent;
                mCurrent++;
                done = true;
                break;
            case ';':
            case ',':
                //just return the brace or the ';'
                countLines();
                result = *mCurrent;
                mCurrent++;
                done = true;
                break;
            case '>': // keep stream operators
                if (*(mCurrent + 1) == '>') {
                    countLines();
                    result = ">>";
                    mCurrent+=2;
                    done = true;
                } else
                    mCurrent+=1;
                break;
            case '<':
                if (*(mCurrent + 1) == '<') {
                    countLines();
                    result = "<<";
                    mCurrent+=2;
                    done = true;
                } else
                    mCurrent+=1;
                break;
            case '=': {
                if (*(mCurrent+1)=='=') {
                    // skip '=='
                    skipAssignment();
                } else {
                    countLines();
                    mCurrent+=1;
                    result = "=";
                    done = true;
                }
                break;
            }
                // NOTE(review): this second break is unreachable.
                break;
            case '!':
                if (*(mCurrent+1)=='=') {
                    skipAssignment();
                } else
                    mCurrent++;
                break;
            case '/':
            case '%':
            case '&':
            case '*':
            case '|':
            case '+':
            case '-':
            case '~':
                // Compound assignments (operator followed by '=') are skipped
                // together with their right-hand side; a lone operator is
                // dropped.
                if (*(mCurrent + 1) == '=') {
                    skipAssignment();
                } else
                    mCurrent++;
                break;
            default:
                advance();
            }
        }
        if (done)
            break;
    }
    return result;
}

// Reads a numeric literal starting at mCurrent.
//
// When the first character is a digit, consumes the following run of digits
// and isHexChar() characters and returns it. Returns an empty string when
// mCurrent is not on a digit.
//
// NOTE(review): when the run is followed by '.', the dot is appended to the
// returned text but mCurrent is NOT advanced past it, so the next scan starts
// at that '.'. Confirm this is intended.
QString CppTokenizer::getNumber()
{
    QChar* offset = mCurrent;
    if (isDigitChar(*mCurrent)) {
        while (isDigitChar(*mCurrent) || isHexChar(*mCurrent)) {
            mCurrent++;
            //advance();
        }
    }
    QString result;
    if (offset != mCurrent) {
        result = QString(offset,mCurrent-offset);
        if (*mCurrent=='.') // keep '.' for decimal
            result += *mCurrent;
    }
    return result;
}

// Returns the text from mCurrent up to where skipToEOL() stops, i.e. the
// directive including any backslash-continued lines and the line-break
// characters that skipToEOL() consumed.
QString CppTokenizer::getPreprocessor()
{
    QChar *offset = mCurrent;
    skipToEOL();
    return QString(offset, mCurrent-offset);
}

// Reads one word starting at mCurrent and returns it, possibly extended by
// what follows it.
//
// bSkipParenthesis  only passed on to the recursive call; not otherwise used
//                   in this function.
// bSkipArray        when true, "[...]" groups directly after the word are
//                   appended to it (passed through simplifyArgs()).
// bSkipBlock        when true, a "{...}" group directly after the word is
//                   skipped.
//
// Extensions applied to the word:
//   - operator words absorb the operator characters that follow;
//   - an angle-bracket group after the word is appended when
//     skipAngleBracketPair() accepts it, or skipped without being appended
//     when the word is "template";
//   - a trailing ".", "->" or "::" is appended; after "::" (unless the word
//     is "using") the next word is appended recursively.
QString CppTokenizer::getWord(bool bSkipParenthesis, bool bSkipArray, bool bSkipBlock)
{
    bool bFoundTemplate = false;
    // bIsSmartPointer:=False;
    // Skip spaces
    skipToNextToken();
    // Get next word...
    QChar* offset = mCurrent;
    // The first character is accepted unconditionally; callers have already
    // checked it.
    mCurrent++;
    // Copy the word ahead of us
    while (isIdentChar(*mCurrent) || isDigitChar(*mCurrent))
        mCurrent++;
    QString currentWord;
    if (offset != mCurrent) {
        currentWord = QString(offset,mCurrent-offset);
    }
    // Append the operator characters and argument list to the operator word
    // NOTE(review): the scan above stops at the first character that is not
    // an identifier character or digit, so '*' or '&' can only be the first
    // character of currentWord; the "operator*" and "operator&" comparisons
    // look unreachable -- confirm.
    if ((currentWord == "operator") || (currentWord == "&operator") || (currentWord == "operator*") || (currentWord == "operator&")) {
        // Spaces between 'operator' and the operator itself are allowed
        while (isSpaceChar(*mCurrent))
            mCurrent++;
        // Find end of operator
        while (isOperatorChar(*mCurrent))
            mCurrent++;
        currentWord = QString(offset,mCurrent-offset);
    } else if (currentWord == "template") {
        bFoundTemplate = true;
    }
    QString result;
    // We found a word...
    if (!currentWord.isEmpty()) {
        result = currentWord;
        // Skip whitespace
        skipToNextToken();
        // Skip template contents, but keep template variable types
        if (*mCurrent == '<') {
            offset = mCurrent;
            if (bFoundTemplate) {
                skipTemplateArgs();
            } else if (skipAngleBracketPair()){
                result += QString(offset, mCurrent-offset);
                skipToNextToken();
            }
        } else if (bSkipArray && (*mCurrent == '[')) {
            // Append array stuff
            while(true) {
                offset = mCurrent;
                skipPair('[', ']');
                result += QString(offset,mCurrent-offset);
                simplifyArgs(result);
                skipToNextToken();
                if (*mCurrent!='[') //maybe multi-dimension array
                    break;
            }
        } else if (bSkipBlock && (*mCurrent == '{')) {
            skipPair('{', '}');
            skipToNextToken();
        }
        // Keep parent/child operators
        if (*mCurrent == '.') {
            result+=*mCurrent;
            mCurrent++;
        } else if ((*mCurrent == '-') && (*(mCurrent + 1) == '>')) {
            result+=QString(mCurrent,2);
            mCurrent+=2;
        } else if ((*mCurrent == ':') && (*(mCurrent + 1) == ':') ) {
            if (result != "using") {
                result+=QString(mCurrent,2);
                mCurrent+=2;
                skipToNextToken();
                if (isIdentChar(*mCurrent)) {
                    // Append next token to this one
                    QString s = getWord(bSkipParenthesis, bSkipArray, bSkipBlock);
                    result += s;
                }
            }
        }
    }
    return result;
}

// True when mCurrent is on '('.
bool CppTokenizer::isArguments()
{
    return *mCurrent == '(';
}

// True when mCurrent is on '(' and the previously produced token was "for".
bool CppTokenizer::isForInit()
{
    return (*mCurrent == '(') && (mLastToken == "for");
}

// True when mCurrent is on a decimal digit.
bool CppTokenizer::isNumber()
{
    return isDigitChar(*mCurrent);
}

// True when mCurrent is on '#'.
bool CppTokenizer::isPreprocessor()
{
    return *mCurrent=='#';
}

// True when mCurrent can start a word (see isLetterChar()), except when the
// very next character is '"'. That exception presumably keeps string-literal
// prefixes such as R"..." from being read as words -- confirm.
bool CppTokenizer::isWord()
{
    bool result = isLetterChar(*mCurrent);
    if (result && (*(mCurrent+1) == '"'))
        result = false;
    return result;
}

// Removes every '\n' and '\r' from `output` and trims leading and trailing
// whitespace, in place.
void CppTokenizer::simplify(QString &output)
{
    //remove \n \r;
    QString temp;
    for (const QChar& ch:output) {
        if (!isLineChar(ch))
            temp+=ch;
    }
    output = temp.trimmed();
}

// Normalises spaces and tabs in `output`, in place.
//
// The text is trimmed first. A run of spaces/tabs between two ordinary
// characters, or after ',' or ')', collapses to a single space. Runs at the
// start, directly after '(' and directly before ',', '(' or ')' are dropped.
void CppTokenizer::simplifyArgs(QString &output)
{
    QString temp;
    // Pending whitespace; only its emptiness is ever tested.
    QString lastSpace = "";
    // True at the start and right after '(': whitespace seen in that state is
    // discarded.
    bool parentheseStart = true;
    foreach (const QChar& ch,output.trimmed()) {
        if (isSpaceChar(ch)) {
            if (!parentheseStart)
                lastSpace+=ch;
        }
        else if (ch==','){
            temp+=ch;
            lastSpace = "";
            parentheseStart = false;
        } else if (ch=='(') {
            temp+=ch;
            lastSpace = "";
            parentheseStart=true;
        } else if (ch==')') {
            temp+=ch;
            lastSpace = "";
            parentheseStart = false;
        } else {
            parentheseStart=false;
            if (!lastSpace.isEmpty()) {
                temp+=" ";
            }
            lastSpace = "";
            temp+=ch;
        }
    }
    output = temp;
}

// Skips the remainder of an expression: advances mCurrent until it rests on
// ',', ';', ')', '}' or the terminating 0.
//
// Parenthesised and braced groups, string literals, character literals and
// raw strings are skipped as units, so terminators inside them are ignored.
// At least one step is always taken before the terminator test.
void CppTokenizer::skipAssignment()
{
    while (true) {
        switch ((*mCurrent).unicode()) {
        case '(':
            skipPair('(', ')');
            break;
        case '"':
            skipDoubleQuotes();
            break;
        case '\'':
            skipSingleQuote();
            break;
        case '{':
            skipPair('{', '}'); // support struct initializers
            break;
        case '/':
            mCurrent++;
            break;
        default:
            if ((*mCurrent == 'R') && (*(mCurrent+1) == '"'))
                skipRawString();
            else
                mCurrent++;
        }
        if (*mCurrent == ',' || *mCurrent ==';' || *mCurrent ==')' || *mCurrent =='}' || *mCurrent ==0)
            break;
    }
}

// Skips a double-quoted literal. mCurrent must be on the opening '"'; on
// return it is just past the closing '"', or on the terminating 0 if the
// literal is not closed.
//
// NOTE(review): a backslash always skips two characters. If the backslash is
// the last character before the terminating 0, mCurrent steps over that 0 --
// consider checking the character after the backslash.
void CppTokenizer::skipDoubleQuotes()
{
    mCurrent++;
    while (!(*mCurrent=='"' || *mCurrent == 0)) {
        if (*mCurrent == '\\')
            mCurrent+=2; // skip escaped char
        else
            mCurrent++;
    }
    if (*mCurrent!=0) {
        mCurrent++;
    }
}

// Skips a bracketed group. mCurrent must be on the opening character; on
// return it is just past the matching `cEnd`, or on the terminating 0 if the
// group is not closed.
//
// Nested "(...)", "[...]", "{...}" and cStart...cEnd groups are skipped
// recursively. Raw strings, string literals and character literals are
// skipped as units unless cStart is itself a quote character.
void CppTokenizer::skipPair(const QChar &cStart, const QChar cEnd)
{
    mCurrent++;
    while (*mCurrent != 0) {
        if (*mCurrent == '(') {
            skipPair('(', ')');
        } else if (*mCurrent == '[') {
            skipPair('[', ']');
        } else if (*mCurrent == '{') {
            skipPair('{', '}');
        } else if (*mCurrent == cStart) {
            skipPair(cStart, cEnd);
        } else if (*mCurrent == cEnd) {
            mCurrent++; // skip over end
            break;
        } else if ((*mCurrent == 'R') && (*(mCurrent+1) == '"')) {
            if (cStart != '\'' && cStart!='\"')
                skipRawString(); // don't do it inside AnsiString!
            else
                mCurrent++;
        } else if (*mCurrent == '"') {
            if (cStart != '\'' && cStart!='\"')
                skipDoubleQuotes(); // don't do it inside AnsiString!
            else
                mCurrent++;
        } else if (*mCurrent == '\'') {
            if (cStart != '\'' && cStart!='\"')
                skipSingleQuote(); // don't do it inside AnsiString!
            else
                mCurrent++;
        } else {
            mCurrent++;
        }
    }
}

// Tries to skip a balanced "<...>" group starting at mCurrent.
//
// Returns true with mCurrent just past the closing '>' on success. Returns
// false with mCurrent restored to its entry position when the text cannot be
// such a group: a ')' or ']' without a matching opener, any of
// { } ; " ' , "->", a single '.', or the end of input.
//
// Assumes it is called with mCurrent on '<' (getWord() checks this), so the
// stack is non-empty whenever '>' is examined.
bool CppTokenizer::skipAngleBracketPair()
{
    QChar* backup=mCurrent;
    // NOTE(review): the template argument of QVector is missing in this copy
    // of the file (same stripping as the includes). The code pushes QChar
    // values, so it was presumably QVector<QChar> -- restore from version
    // control.
    QVector stack;
    while (*mCurrent != '\0') {
        switch((*mCurrent).unicode()) {
        case '<':
        case '(':
        case '[':
            stack.push_back(*mCurrent);
            break;
        case ')':
            // Discard unclosed '<' / '[' entries above the nearest '('.
            while (!stack.isEmpty() && stack.back()!='(') {
                stack.pop_back();
            }
            //pop up '('
            if (stack.isEmpty()) {
                mCurrent=backup;
                return false;
            }
            stack.pop_back();
            break;
        case ']':
            while (!stack.isEmpty() && stack.back()!='[')
                stack.pop_back();
            //pop up '['
            if (stack.isEmpty()) {
                mCurrent=backup;
                return false;
            }
            stack.pop_back();
            break;
        case '>':
            // A '>' inside (...) or [...] is ignored because the top of the
            // stack is not '<'.
            // NOTE(review): stack.back() is called without an isEmpty()
            // check; this relies on the entry precondition above.
            if (stack.back()=='<')
                stack.pop_back();
            if (stack.isEmpty()) {
                mCurrent++;
                return true;
            }
            break;
        case '{':
        case '}':
        case ';':
        case '"':
        case '\'':
            mCurrent=backup;
            return false;
        case '-':
            if (*(mCurrent+1)=='>') {
                mCurrent=backup;
                return false;
            }
            break;
        case '.':
            // A lone '.' rejects the group; a run of dots (e.g. "...") is
            // accepted and skipped.
            if (*(mCurrent+1)!='.') {
                mCurrent=backup;
                return false;
            }
            // skip
            while (*(mCurrent+1)=='.')
                mCurrent++;
            break;
        }
        mCurrent++;
    }
    mCurrent=backup;
    return false;
}

// Skips a raw string literal. mCurrent must be on the 'R' that precedes the
// opening '"'.
//
// After the opening quote, a '"' ends the literal only while no '(' is open:
// '(' switches quote matching off and ')' switches it back on. On return
// mCurrent is just past the closing '"', or on the terminating 0.
void CppTokenizer::skipRawString()
{
    mCurrent++; //skip R
    bool noEscape = false;
    while(true) {
        // Pre-increment: the first pass steps over the opening '"'.
        mCurrent++;
        switch(mCurrent->unicode()) {
        case '(':
            noEscape = true;
            break;
        case ')':
            noEscape = false;
            break;
        }
        if (*mCurrent == 0)
            break;
        if ((*mCurrent == '"') && !noEscape)
            break;
    }
    if (*mCurrent!=0)
        mCurrent++;
}

// Skips a single-quoted literal. mCurrent must be on the opening quote; on
// return it is just past the closing quote, or on the terminating 0 if the
// literal is not closed.
//
// NOTE(review): same two-character backslash skip as skipDoubleQuotes(); a
// backslash directly before the terminating 0 steps over it.
void CppTokenizer::skipSingleQuote()
{
    mCurrent++;
    while (!(*mCurrent=='\'' || *mCurrent == 0)) {
        if (*mCurrent == '\\')
            mCurrent+=2; // skip escaped char
        else
            mCurrent++;
    }
    if (*mCurrent!=0) {
        mCurrent++;
    }
}

// Skips a line continuation: the backslash at mCurrent and every line-break
// character that follows it.
void CppTokenizer::skipSplitLine()
{
    mCurrent++; // skip '\'
    while ( isLineChar(*mCurrent)) // skip newline
        mCurrent++;
}

// Skips a "<...>" group via skipPair(); does nothing unless mCurrent is on
// '<'.
void CppTokenizer::skipTemplateArgs()
{
    if (*mCurrent != '<')
        return;
    skipPair('<', '>');
}

// Advances mCurrent past the end of the current logical line.
//
// The line-break characters themselves are consumed. When the character just
// before the line break is a backslash, the following line is treated as a
// continuation and skipped too. Stops on the terminating 0 without moving
// past it.
void CppTokenizer::skipToEOL()
{
    while (true) {
        while (!isLineChar(*mCurrent) && (*mCurrent!=0)) {
            mCurrent++;
        }
        if (*mCurrent==0)
            return;
        // Must be read before the line break is consumed below.
        bool splitLine = (*(mCurrent - 1) == '\\');
        while (isLineChar(*mCurrent))
            mCurrent++;
        if (!splitLine || *mCurrent==0)
            break;
    }
}

// Skips spaces, tabs and line-break characters.
void CppTokenizer::skipToNextToken()
{
    while (isSpaceChar(*mCurrent) || isLineChar(*mCurrent))
        mCurrent++;
}

// True for '_' and for any character QChar::isLetter() accepts.
bool CppTokenizer::isIdentChar(const QChar &ch)
{
    return ch=='_' || ch.isLetter() ;
}

// Number of lambda-capture tokens currently recorded.
int CppTokenizer::lambdasCount() const
{
    return mLambdas.count();
}

// Token-list index of the first recorded lambda-capture token.
// NOTE(review): calls front() without an emptiness check; callers presumably
// test lambdasCount() first -- confirm.
int CppTokenizer::indexOfFirstLambda() const
{
    return mLambdas.front();
}

// Forgets the first recorded lambda-capture token.
// NOTE(review): calls pop_front() without an emptiness check; see
// indexOfFirstLambda().
void CppTokenizer::removeFirstLambda()
{
    mLambdas.pop_front();
}

// Moves mCurrent forward by one lexical unit: a whole string literal,
// character literal, raw string or line continuation when mCurrent is on the
// start of one, otherwise a single character.
void CppTokenizer::advance()
{
    switch(mCurrent->unicode()) {
    case '\"':
        skipDoubleQuotes();
        break;
    case '\'':
        skipSingleQuote();
        break;
    case '\\':
        if (isLineChar(*(mCurrent + 1)))
            skipSplitLine();
        else
            mCurrent++;
        break;
    case 'R':
        if (*(mCurrent+1) == '"')
            skipRawString();
        else
            mCurrent++;
        break;
    default:
        mCurrent++;
    }
}

// True for characters that may start a word: identifier characters plus '*',
// '&' and '~'.
// NOTE(review): the explicit `ch == '_'` test is redundant; isIdentChar()
// already accepts '_'.
bool CppTokenizer::isLetterChar(const QChar &ch)
{
    // return (ch>= 'A' && ch<='Z')
    //         || (ch>='a' && ch<='z')
    return isIdentChar(ch) || ch == '_' || ch == '*' || ch == '&' || ch == '~';
}

// True for 'A'-'F', 'a'-'f', 'x' and 'L'. The last two are presumably there
// for the "0x" prefix and the 'L' suffix of numeric literals -- confirm.
bool CppTokenizer::isHexChar(const QChar &ch)
{
    return (ch >= 'A' && ch<='F') || (ch>='a' && ch<='f') || ch == 'x' || ch == 'L';
}

// True for '0'-'9'.
bool CppTokenizer::isDigitChar(const QChar &ch)
{
    return (ch>='0' && ch<='9');
}

// True for space and tab.
bool CppTokenizer::isSpaceChar(const QChar &ch)
{
    return (ch == ' ' || ch == '\t');
}

// True for '\n' and '\r'.
bool CppTokenizer::isLineChar(const QChar &ch)
{
    return (ch=='\n' || ch=='\r');
}

// True for character codes 1 through 32.
bool CppTokenizer::isBlankChar(const QChar &ch)
{
    return (ch<=32) && (ch>0);
}

// True for the characters that may follow the word "operator" in getWord():
// + - / * [ ] = % ! & | > < ^
bool CppTokenizer::isOperatorChar(const QChar &ch)
{
    switch (ch.unicode()) {
    case '+':
    case '-':
    case '/':
    case '*':
    case '[':
    case ']':
    case '=':
    case '%':
    case '!':
    case '&':
    case '|':
    case '>':
    case '<':
    case '^':
        return true;
    default:
        return false;
    }
}

// True when the characters in [wordStart, wordEnd) are exactly `text`.
bool CppTokenizer::currentWordEquals(QChar *wordStart, QChar *wordEnd, const QString& text)
{
    QString currentWord(wordStart, wordEnd-wordStart);
    return currentWord == text;
}