1 /**
2 Copyright: Copyright (c) 2017-2018 Andrey Penechko.
3 License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0).
4 Authors: Andrey Penechko.
5 */
6 module voxelman.text.lexer;
7 
8 //import std.array;
9 import std.range;
10 import std.uni;
11 import std.utf : byDchar, decodeFront;
12 import std.stdio;
13 
/// Simple LIFO stack stored in a dynamic array.
struct Stack(T)
{
	import std.array;

	/// Backing storage; the top of the stack is the last element.
	T[] data;

	/// True when the stack holds no elements.
	@property bool empty()
	{
		return data.length == 0;
	}

	/// Number of elements currently stored.
	@property size_t length()
	{
		return data.length;
	}

	/// Places `val` on top of the stack.
	void push(T val)
	{
		data ~= val;
	}

	/// Removes the top element and returns it. The stack must not be empty.
	T pop()
	{
		assert(!empty);

		immutable topIndex = data.length - 1;
		auto top = data[topIndex];
		data = data[0 .. topIndex];

		// Allow the next push to append in place over the freed slot.
		// assumeSafeAppend is a runtime call, so it is skipped during CTFE.
		if (!__ctfe)
			cast(void) data.assumeSafeAppend();

		return top;
	}
}
31 
/// Kinds of tokens a Lexer can report.
/// The first member (SOI) is the default value of a `TokenType`, so a
/// default-initialized Token has type SOI.
enum TokenType
{
	SOI, // start of input
	Invalid, // reported by Lexer.next when none of the matchers succeeds
	Hexadecimal,
	Binary,
	Decimal,
	String,
	LabelDefinition,
	ReservedWord,
	BinaryOpCode,
	UnaryOpCode,
	Register,
	Identifier,
	OpenBracket,
	CloseBracket,
	Plus,
	Minus,
	Multiply,
	Divide,
	Comma,
	EOI // end of input
}
55 
/// A lexed token: its kind and the stream positions that delimit it.
struct Token
{
	// Kind of the token; defaults to the first TokenType member.
	TokenType type;
	// Stream position at which matching of this token started.
	StreamPos start;
	// Stream position after a successful match. Lexer leaves this at its
	// default (pos == -1) for the Invalid and EOI tokens it constructs.
	StreamPos end;
}
62 
/// Position inside a CharStream.
struct StreamPos
{
	// Index of a character in the stream. CharStream increments it once per
	// decoded character, so the first character gets 0; -1 means that no
	// character has been read yet.
	int pos = -1;
}
67 
/// Character stream over a forward range of characters.
///
/// Keeps one decoded character of lookahead in `current` and supports
/// backtracking through a stack of checkpoints (saved stream states).
struct CharStream(R)
	if (isForwardRange!R && is(ElementType!R : dchar))
{
	/// Independent copy of the input as passed to the constructor.
	/// Only its length is read, to compute `currentOffset`.
	R originalInput;

	/// Everything a checkpoint has to save and restore.
	struct StreamState
	{
		/// Most recently decoded character: '\2' before the first call to
		/// next(), '\3' once the stream advanced past the last character.
		dchar current = '\2'; // start of text
		/// Set once next() was called with no input left.
		bool empty;
		/// Index of `current`, counted in decoded characters.
		StreamPos currentPos;
		/// Input remaining after `current`.
		R input;
		/// Offset of `current` in `originalInput`, in units of R's length.
		size_t currentOffset;
	}

	StreamState _state;
	alias _state this;

	Stack!StreamState _checkpointStack;

	/// Wraps `inp` and decodes its first character into `current`.
	this(R inp)
	{
		// Copying a forward range does not guarantee an independent
		// iteration state, so take the length reference via save.
		originalInput = inp.save;
		input = inp;
		next();
	}

	/// Returns false if input is empty
	/// Updates current and returns true otherwise
	bool next()
	{
		if (input.empty)
		{
			if (!this.empty) // advance past last char
			{
				++currentPos.pos;
				currentOffset = originalInput.length - this.input.length;
				current = '\3'; // end of text
			}

			this.empty = true;
			return false;
		}

		// The offset is taken before decoding, so it points at `current`.
		currentOffset = originalInput.length - this.input.length;
		current = decodeFront!(Yes.useReplacementDchar)(input);
		++currentPos.pos;
		return true;
	}

	/// Skips zero or more whitespace chars consuming input
	void skipSpace()
	{
		while (isWhite(current) && next()) {}
	}

	/// Matches all chars from str, ignoring case (both sides are compared
	/// after toLower), consuming input and returns true
	/// If fails consumes no input and returns false
	bool match(S)(S str)
		if (isInputRange!S && is(ElementType!S : dchar))
	{
		return matchSequence!true(str);
	}

	/// Matches a single char exactly; consumes it and returns true on
	/// success, otherwise consumes nothing and returns false
	bool match(dchar chr)
	{
		if (current == chr)
		{
			next();
			return true;
		}

		return false;
	}

	/// Case-sensitive variant of the string match
	/// If fails consumes no input and returns false
	bool matchCase(S)(S str)
		if (isInputRange!S && is(ElementType!S : dchar))
	{
		return matchSequence!false(str);
	}

	/// Matches single char for which `pred(current)` is true
	bool match(alias pred)()
	{
		if (pred(current))
		{
			next();
			return true;
		}

		return false;
	}

	/// Matches a single char equal to any of `options`
	bool matchAnyOf(dchar[] options...)
	{
		foreach (option; options)
		{
			if (option == current)
			{
				next();
				return true;
			}
		}

		return false;
	}

	/// Consumes `optional` if it is the current char; always returns true
	bool matchOpt(dchar optional)
	{
		match(optional);
		return true;
	}

	/// save current stream position
	void pushCheckpoint() {
		StreamState snapshot = _state;
		// A plain copy of a forward range may share iteration state with
		// the live range; save gives the checkpoint its own position.
		snapshot.input = _state.input.save;
		_checkpointStack.push(snapshot);
	}

	/// drop the most recent checkpoint, keeping the current position
	void discardCheckpoint() {
		_checkpointStack.pop;
	}

	/// restore saved position
	void popCheckpoint() {
		_state = _checkpointStack.pop;
	}

	/// Shared implementation of match(str) and matchCase(str): consumes
	/// `str` char by char, rolling back to the starting position on the
	/// first mismatch or when the stream ends.
	private bool matchSequence(bool ignoreCase, S)(S str)
	{
		pushCheckpoint;
		foreach (dchar item; str.byDchar)
		{
			static if (ignoreCase)
				immutable same = toLower(item) == toLower(current);
			else
				immutable same = item == current;

			if (this.empty || !same)
			{
				popCheckpoint;
				return false;
			}

			next();
		}

		discardCheckpoint;
		return true;
	}
}
241 
242 /*
243 	TokenMatcher(ctRegex!(r"^(0x[0-9ABCDEF]+)\b","i"), TokenType.Hexadecimal),
244 	TokenMatcher(ctRegex!(r"^(0b[0-1]+)\b","i"), TokenType.Binary),
245 	TokenMatcher(ctRegex!(r"^([0-9]+)\b"), TokenType.Decimal),
246 	TokenMatcher(ctRegex!( "^(\".*\")"), TokenType.String),
247 	TokenMatcher(ctRegex!(r"^((:[0-9A-Za-z_]+)|([0-9A-Za-z_]+:))"), TokenType.LabelDefinition),
248 	TokenMatcher(ctRegex!(r"^(POP|PUSH|PEEK|PICK|DAT|DATA|DW|WORD)\b","i"), TokenType.ReservedWord),
249 	TokenMatcher(ctRegex!(r"^(SET|ADD|SUB|MUL|MLI|DIV|DVI|MOD|MDI|AND|BOR|XOR|SHR|ASR|SHL|IFB|IFC|IFE|IFN|IFG|IFA|IFL|IFU|ADX|SBX|STI|STD)\b","i"), TokenType.BinaryOpCode),
250 	TokenMatcher(ctRegex!(r"^(JSR|INT|IAG|IAS|RFI|IAQ|HWN|HWQ|HWI)\b", "i"), TokenType.UnaryOpCode),
251 	TokenMatcher(ctRegex!(r"^([ABCXYZIJ]|SP|PC|EX)\b","i"), TokenType.Register),
252 	TokenMatcher(ctRegex!(r"^([0-9A-Za-z_]+)"), TokenType.Identifier),
253 	TokenMatcher(ctRegex!(r"^\["), TokenType.OpenBracket),
254 	TokenMatcher(ctRegex!(r"^\+"), TokenType.Plus),
255 	TokenMatcher(ctRegex!(r"^-"), TokenType.Minus),
256 	TokenMatcher(ctRegex!(r"^\*"), TokenType.Multiply),
257 	TokenMatcher(ctRegex!(r"^/"), TokenType.Divide),
258 	TokenMatcher(ctRegex!(r"^\]"), TokenType.CloseBracket),
259 	TokenMatcher(ctRegex!("^,"), TokenType.Comma),
260 */
261 
/// Returns true when `chr` is one of the ASCII decimal digits '0' .. '9'.
bool isDigit(dchar chr) pure nothrow
{
	switch (chr)
	{
		case '0': .. case '9':
			return true;
		default:
			return false;
	}
}
266 
/// Returns true when `chr` is an ASCII hexadecimal digit:
/// '0' .. '9', 'a' .. 'f' or 'A' .. 'F'.
bool isHexDigit(dchar chr) pure nothrow
{
	switch (chr)
	{
		case '0': .. case '9':
		case 'a': .. case 'f':
		case 'A': .. case 'F':
			return true;
		default:
			return false;
	}
}
274 
/// Pairs a matching routine with the token type it recognizes.
struct TokenMatcher
{
	// Called by Lexer.next; returning true reports a successful match.
	// Lexer restores the stream position itself when it returns false.
	bool delegate() matcher;
	// Type given to the token produced when `matcher` succeeds.
	TokenType type;
}
280 
alias StringLexer = Lexer!string;

/// Tokenizer driven by an ordered list of matchers.
///
/// `current` holds the most recently produced token; a default-initialized
/// lexer therefore starts with a token of type TokenType.SOI. Iterating with
/// foreach (opApply) yields `current` first, then every following token, and
/// ends after the EOI token has been delivered.
struct Lexer(R)
{
	/// Character stream tokens are read from.
	CharStream!R input;
	/// Matchers tried in order at each position; the first success wins.
	TokenMatcher[] matchers;
	/// Most recently produced token.
	Token current;
	/// Set once next() is called after the EOI token was already produced.
	bool empty;

	/// foreach support: passes `current` to `del`, advances, and repeats
	/// until the lexer is empty. A non-zero result of `del` (break/return
	/// in the loop body) stops iteration and is passed through.
	int opApply(scope int delegate(in Token) del)
	{
		do
		{
			if (auto ret = del(current))
				return ret;
			next();
		}
		while (!empty);
		return 0;
	}

	/// Matches "0x" (case-insensitively) followed by one or more hex digits.
	/// On failure after the prefix matched, the prefix stays consumed;
	/// next() rolls that back through its own checkpoint.
	bool matchHexNumber()
	{
		if (!input.match("0x")) return false;
		if (!input.match!isHexDigit) return false;
		while (input.match!isHexDigit) {}
		return true;
	}

	/// Matches "/" followed by one or more hex digits.
	// NOTE(review): apart from the prefix this is identical to
	// matchHexNumber and does not look like a comment syntax; presumably an
	// unfinished stub. Behaviour is left unchanged - confirm the intended
	// comment grammar before relying on it.
	bool matchComment()
	{
		if (!input.match("/")) return false;
		if (!input.match!isHexDigit) return false;
		while (input.match!isHexDigit) {}
		return true;
	}

	/// Produces the next token into `current`.
	private void next()
	{
		if (checkInputState) return;

		foreach (candidate; matchers)
		{
			input.pushCheckpoint;
			StreamPos startPos = input.currentPos;

			if (candidate.matcher())
			{
				current = Token(candidate.type, startPos, input.currentPos);
				input.discardCheckpoint;
				return;
			}

			// Undo whatever the failed matcher consumed.
			input.popCheckpoint;
		}

		// Nothing matched. Consume one character so that the following call
		// starts at a new position; without this the same Invalid token
		// would be produced forever and opApply would never terminate.
		StreamPos invalidStart = input.currentPos;
		input.next();
		current = Token(TokenType.Invalid, invalidStart, input.currentPos);
	}

	// returns true if no matching should be done
	private bool checkInputState()
	{
		if (!input.empty)
			return false; // continue matching

		if (current.type == TokenType.EOI) // on second try mark as empty and return
		{
			empty = true;
		}
		else // when input just became empty emit EOI token, but do not mark us as empty
		{
			current = Token(TokenType.EOI, input.currentPos);
		}

		return true; // exit matching
	}
}