/**
Copyright: Copyright (c) 2017-2018 Andrey Penechko.
License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0).
Authors: Andrey Penechko.
*/
module voxelman.text.lexer;

//import std.array;
import std.range;
import std.uni;
import std.utf : byDchar, decodeFront;
import std.stdio;

/// Minimal LIFO stack backed by a dynamic array.
struct Stack(T)
{
	import std.array;

	/// Stored items; the last element is the top of the stack.
	T[] data;

	/// True when the stack holds no items.
	@property bool empty(){ return data.empty; }

	/// Number of items currently stored.
	@property size_t length(){ return data.length; }

	/// Puts val on top of the stack.
	void push(T val){ data ~= val; }

	/// Removes the top item and returns it. The stack must not be empty.
	T pop()
	{
		assert(!empty);
		auto val = data[$ - 1];
		data = data[0 .. $ - 1];
		// Allow the next push to reuse the freed slot instead of reallocating.
		// assumeSafeAppend is a runtime facility, so it is skipped during CTFE.
		if (!__ctfe)
			cast(void)data.assumeSafeAppend();
		return val;
	}
}

/// Kinds of tokens the lexer can produce.
enum TokenType
{
	SOI, // start of input; first member, so a default-initialized Token has this type
	Invalid,
	Hexadecimal,
	Binary,
	Decimal,
	String,
	LabelDefinition,
	ReservedWord,
	BinaryOpCode,
	UnaryOpCode,
	Register,
	Identifier,
	OpenBracket,
	CloseBracket,
	Plus,
	Minus,
	Multiply,
	Divide,
	Comma,
	EOI // end of input
}

/// A token: its kind plus the stream positions where it starts and ends.
/// `end` is the position of the stream right after the token was matched.
struct Token
{
	TokenType type;
	StreamPos start;
	StreamPos end;
}

/// Position inside a CharStream. -1 means "not set".
struct StreamPos
{
	int pos = -1;
}

/// Character stream over a forward range with checkpoint/rollback support.
///
/// Note: `next` uses `.length` of the range to compute `currentOffset`,
/// so R must also provide `length` (as `string` does).
struct CharStream(R)
	if (isForwardRange!R && is(ElementType!R : dchar))
{
	/// The range passed to the constructor, kept unmodified.
	R originalInput;

	/// Everything that must be saved and restored by a checkpoint.
	struct StreamState
	{
		dchar current = '\2'; // start of text
		bool empty;           // set once the stream advanced past the last char
		StreamPos currentPos; // index of `current`, incremented once per next()
		R input;              // remaining, not yet decoded input
		size_t currentOffset; // originalInput.length - input.length taken before decoding `current`
	}

	StreamState _state;
	alias _state this;

	Stack!StreamState _checkpointStack;

	/// Creates a stream over inp and loads the first character into `current`.
	this(R inp)
	{
		originalInput = input = inp;
		next();
	}

	/// Returns false if input is empty
	/// Updates current and returns true otherwise
	bool next()
	{
		if (input.empty)
		{
			//writefln("next empty front %s ", current);
			if (!this.empty) // advance past last char
			{
				++currentPos.pos;
				currentOffset = originalInput.length - this.input.length;
				current = '\3'; // end of text
			}

			this.empty = true;
			return false;
		}

		// Offset must be taken before decodeFront, which pops the decoded units.
		currentOffset = originalInput.length - this.input.length;
		current = decodeFront!(Yes.useReplacementDchar)(input);
		++currentPos.pos;
		//input.popFront();
		//writefln("next, state %s", _state);
		return true;
	}

	/// Skips zero or more whitespace chars consuming input
	void skipSpace()
	{
		while (isWhite(current) && next()) {}
	}

	/// Matches all chars from str (compared case-insensitively via toLower)
	/// consuming input and returns true
	/// If fails consumes no input and returns false
	bool match(S)(S str)
		if (isInputRange!S && is(ElementType!S : dchar))
	{
		pushCheckpoint;
		foreach (dchar item; str.byDchar)
		{
			if (this.empty || toLower(item) != toLower(current))
			{
				popCheckpoint; // roll back everything consumed so far
				return false;
			}

			next();
		}

		discardCheckpoint;
		return true;
	}

	/// Matches a single char exactly. Consumes it and returns true on success,
	/// consumes nothing and returns false otherwise.
	bool match(dchar chr)
	{
		//if (input.empty) return false;

		if (current == chr)
		{
			next();
			return true;
		}

		return false;
	}

	/// Same as the string overload of match, but compares chars exactly (case-sensitive).
	bool matchCase(S)(S str)
		if (isInputRange!S && is(ElementType!S : dchar))
	{
		pushCheckpoint;
		foreach (dchar item; str.byDchar)
		{
			if (this.empty || item != current)
			{
				popCheckpoint; // roll back everything consumed so far
				return false;
			}

			next();
		}

		discardCheckpoint;
		return true;
	}

	/// Matches single char for which pred returns true
	bool match(alias pred)()
	{
		//if (this.empty) return false;

		if (pred(current))
		{
			next();
			return true;
		}

		return false;
	}

	/// Matches a single char equal to any of options.
	/// Consumes it and returns true on success, returns false otherwise.
	bool matchAnyOf(dchar[] options...)
	{
		//if (this.empty) return false;

		foreach (option; options)
		{
			if (option == current)
			{
				next();
				return true;
			}
		}

		return false;
	}

	/// Consumes optional if it is the current char. Always returns true.
	bool matchOpt(dchar optional)
	{
		match(optional);
		return true;
	}

	/// save current stream position
	void pushCheckpoint() {
		_checkpointStack.push(_state);
	}

	/// drop the most recently saved position, keeping the current stream state
	void discardCheckpoint() {
		_checkpointStack.pop;
	}

	/// restore saved position
	void popCheckpoint() {
		_state = _checkpointStack.pop;
	}
}

/*
	TokenMatcher(ctRegex!(r"^(0x[0-9ABCDEF]+)\b","i"), TokenType.Hexadecimal),
	TokenMatcher(ctRegex!(r"^(0b[0-1]+)\b","i"), TokenType.Binary),
	TokenMatcher(ctRegex!(r"^([0-9]+)\b"), TokenType.Decimal),
	TokenMatcher(ctRegex!( "^(\".*\")"), TokenType.String),
	TokenMatcher(ctRegex!(r"^((:[0-9A-Za-z_]+)|([0-9A-Za-z_]+:))"), TokenType.LabelDefinition),
	TokenMatcher(ctRegex!(r"^(POP|PUSH|PEEK|PICK|DAT|DATA|DW|WORD)\b","i"), TokenType.ReservedWord),
	TokenMatcher(ctRegex!(r"^(SET|ADD|SUB|MUL|MLI|DIV|DVI|MOD|MDI|AND|BOR|XOR|SHR|ASR|SHL|IFB|IFC|IFE|IFN|IFG|IFA|IFL|IFU|ADX|SBX|STI|STD)\b","i"), TokenType.BinaryOpCode),
	TokenMatcher(ctRegex!(r"^(JSR|INT|IAG|IAS|RFI|IAQ|HWN|HWQ|HWI)\b", "i"), TokenType.UnaryOpCode),
	TokenMatcher(ctRegex!(r"^([ABCXYZIJ]|SP|PC|EX)\b","i"), TokenType.Register),
	TokenMatcher(ctRegex!(r"^([0-9A-Za-z_]+)"), TokenType.Identifier),
	TokenMatcher(ctRegex!(r"^\["), TokenType.OpenBracket),
	TokenMatcher(ctRegex!(r"^\+"), TokenType.Plus),
	TokenMatcher(ctRegex!(r"^-"), TokenType.Minus),
	TokenMatcher(ctRegex!(r"^\*"), TokenType.Multiply),
	TokenMatcher(ctRegex!(r"^/"), TokenType.Divide),
	TokenMatcher(ctRegex!(r"^\]"), TokenType.CloseBracket),
	TokenMatcher(ctRegex!("^,"), TokenType.Comma),
*/

/// True for the ASCII decimal digits '0'..'9'.
bool isDigit(dchar chr) pure nothrow
{
	return '0' <= chr && chr <= '9';
}

/// True for ASCII hexadecimal digits: '0'..'9', 'a'..'f', 'A'..'F'.
bool isHexDigit(dchar chr) pure nothrow
{
	return
		'0' <= chr && chr <= '9' ||
		'a' <= chr && chr <= 'f' ||
		'A' <= chr && chr <= 'F';
}

/// Pairs a matching routine with the token type it produces.
/// The delegate returns true when it recognized (and consumed) a token.
struct TokenMatcher
{
	bool delegate() matcher;
	TokenType type;
}

alias StringLexer = Lexer!string;

/// Splits a character stream into tokens by trying `matchers` in order.
/// `input` and `matchers` are expected to be filled in by the user before iterating.
struct Lexer(R)
{
	CharStream!R input;
	TokenMatcher[] matchers;
	Token current; /// Most recently produced token; has type SOI before the first advance.
	bool empty;    /// Set after the EOI token has been delivered.

	/// foreach support. Passes `current` to del first and advances afterwards,
	/// so the token held at call time is delivered before any matching happens
	/// and the EOI token is delivered once before iteration stops.
	/// Stops early and returns del's result when it is non-zero.
	int opApply(scope int delegate(in Token) del)
	{
		do
		{
			if (auto ret = del(current))
				return ret;
			next();
		}
		while (!empty);
		return 0;
	}

	/// Matches "0x" (case-insensitive) followed by one or more hex digits.
	bool matchHexNumber()
	{
		if (!input.match("0x")) return false;
		if (!input.match!isHexDigit) return false;
		while (input.match!isHexDigit) {}
		return true;
	}

	/// Matches "/" followed by one or more hex digits.
	/// NOTE(review): the body mirrors matchHexNumber apart from the prefix and
	/// does not look like comment syntax - confirm the intended grammar.
	bool matchComment()
	{
		if (!input.match("/")) return false;
		if (!input.match!isHexDigit) return false;
		while (input.match!isHexDigit) {}
		return true;
	}

	/// Produces the next token into `current`.
	/// The first matcher that succeeds wins; a failed matcher has its consumed
	/// input rolled back before the next one is tried.
	private void next()
	{
		if (checkInputState) return;

		foreach (matcher; matchers)
		{
			input.pushCheckpoint;
			StreamPos startPos = input.currentPos;

			bool matchSuccess = matcher.matcher();

			if (matchSuccess)
			{
				current = Token(matcher.type, startPos, input.currentPos);
				//writefln("success on %s, state %s", matcher.type, input._state);
				input.discardCheckpoint;
				return;
			}

			input.popCheckpoint;
			//writefln("fail %s", matcher.type);
		}

		// Nothing matched: report the offending character as Invalid and consume it.
		// Without consuming, the stream never reaches its end and iteration would
		// yield the same Invalid token forever.
		StreamPos invalidStart = input.currentPos;
		input.next();
		current = Token(TokenType.Invalid, invalidStart, input.currentPos);
	}

	// returns true if no matching should be done
	private bool checkInputState()
	{
		if (input.empty)
		{
			if (current.type == TokenType.EOI) // on second try mark as empty and return
			{
				empty = true;
			}
			else // when input just became empty emit EOI token, but do not mark us as empty
			{
				current = Token(TokenType.EOI, input.currentPos);
			}

			return true; // exit matching
		}

		return false; // continue matching
	}
}