14: Lexical Scanner Implementation (4)
2018/10/29
I wrote a scanner to turn the input into tokens. The code is in `lexer.rs`.
The algorithm is simple. The scanner reads the message char by char and keeps two cursors, `cursor_l` and `cursor_r`, which mark the start and end of the word currently being read. When the incoming char is one of the separators or delimiters, the scanner checks the last word selected by the two cursors. If the word is a keyword, the keyword is added as a token. Otherwise, an identifier token carrying the word as its name is added.
A few small parts still need to be fixed, but the function is almost done.
If we have a message:
select customername, contactname, address from customers where address is null;
then the scanner will get:
[
Symbol { name: "select", len: 6, token: Select, group: Keyword },
Symbol { name: "customername", len: 12, token: Identifier, group: Identifier },
Symbol { name: ",", len: 1, token: Comma, group: Delimiter },
Symbol { name: "", len: 0, token: Identifier, group: Identifier },
Symbol { name: "contactname", len: 11, token: Identifier, group: Identifier },
Symbol { name: ",", len: 1, token: Comma, group: Delimiter },
Symbol { name: "", len: 0, token: Identifier, group: Identifier },
Symbol { name: "address", len: 7, token: Identifier, group: Identifier },
Symbol { name: "from", len: 4, token: From, group: Keyword },
Symbol { name: "customers", len: 9, token: Identifier, group: Identifier },
Symbol { name: "where", len: 5, token: Where, group: Keyword },
Symbol { name: "address", len: 7, token: Identifier, group: Identifier },
Symbol { name: "is", len: 2, token: Identifier, group: Identifier },
Symbol { name: "null", len: 4, token: Identifier, group: Identifier },
Symbol { name: ";", len: 1, token: Semicolon, group: Delimiter }
]
`is null` should be recognized as a single token, so I will fix that later.
As you can see, we get the tokens, and we can use these tokens for the next step.
sql/lexer.rs
use sql::symbol;
/// Lexical scanner that turns a message into a list of symbols.
#[derive(Clone, Debug)]
pub struct Scanner<'a> {
    /// Text being scanned; `Scanner::new` stores it lower-cased and trimmed.
    message: String,
    /// Symbols collected so far by `scan_tokens`.
    tokens: Vec<symbol::Symbol<'a>>,
    /// The two cursors that delimit the word currently being read.
    pos: Pos,
}
/// Cursor pair selecting the word the scanner is currently reading.
///
/// The scanner slices its message with the range `cursor_l..cursor_r`.
#[derive(Clone, Debug)]
struct Pos {
    /// Left bound (inclusive) of the selected word.
    cursor_l: usize,
    /// Right bound (exclusive) of the selected word.
    cursor_r: usize,
}
impl<'a> Scanner<'a> {
    /// Creates a scanner for `message`.
    ///
    /// The text is lower-cased and then trimmed once, up front, so every word
    /// is looked up in its lower-case form. Both cursors start at offset 0.
    pub fn new(message: &str) -> Scanner {
        Scanner {
            message: message.to_lowercase().trim().to_string(),
            tokens: vec![],
            pos: Pos {
                cursor_l: 0,
                cursor_r: 0,
            },
        }
    }

    /// Scans the stored message and returns a copy of the collected symbols.
    ///
    /// The message is walked character by character. `cursor_l..cursor_r` is
    /// the byte range of the word read so far. When a separator (whitespace,
    /// `(`, `)`, `,` or `;`) is met, the pending word, if any, is emitted as a
    /// keyword (when found in `symbol::SYMBOLS`) or as an identifier, and the
    /// separator itself is emitted when it is a delimiter. A word that runs up
    /// to the end of the message is emitted as well.
    ///
    /// # Panics
    ///
    /// Panics if `symbol::Symbol::match_delimiter` returns `None` for a
    /// character that `is_delimiter` accepts.
    pub fn scan_tokens(&'a mut self) -> Vec<symbol::Symbol<'a>> {
        println!("Starting scanning message: {}", self.message);

        // `char_indices` yields byte offsets, which is what string slicing
        // needs; counting chars instead would drift (or panic) as soon as the
        // message contains a multi-byte character.
        for (offset, ch) in self.message.char_indices() {
            let next = offset + ch.len_utf8();

            if is_letter_or_number(ch) {
                self.pos.cursor_r = next;
                continue;
            }

            match ch {
                ' ' | '\t' | '\r' | '\n' | '(' | ')' | ',' | ';' => {
                    if self.pos.cursor_l != self.pos.cursor_r {
                        let word = self
                            .message
                            .get(self.pos.cursor_l..self.pos.cursor_r)
                            .expect("cursors always sit on char boundaries");
                        println!("encounter `{}`, last word is {}", ch, word);
                        self.tokens.push(Self::word_symbol(word));
                    }

                    // Emit the delimiter whether or not a word preceded it, so
                    // sequences such as `) ,` or ` (` do not lose tokens.
                    if is_delimiter(ch) {
                        self.tokens.push(
                            symbol::Symbol::match_delimiter(ch)
                                .expect("every delimiter char maps to a symbol"),
                        );
                    }

                    // Start the next word right after the separator.
                    self.pos.cursor_l = next;
                    self.pos.cursor_r = next;
                }
                _ => {
                    // Not part of the supported grammar yet. The character is
                    // kept inside the pending word so that no input is dropped
                    // and the cursors stay aligned with the iterator.
                    // TODO: report unsupported characters to the caller.
                    self.pos.cursor_r = next;
                }
            }
        }

        // The message may end in the middle of a word (no trailing separator).
        if self.pos.cursor_l != self.pos.cursor_r {
            let word = self
                .message
                .get(self.pos.cursor_l..self.pos.cursor_r)
                .expect("cursors always sit on char boundaries");
            self.tokens.push(Self::word_symbol(word));
            self.pos.cursor_l = self.pos.cursor_r;
        }

        self.tokens.clone()
    }

    /// Maps a scanned word to its keyword symbol, or to an identifier symbol
    /// named after the word when it is not in `symbol::SYMBOLS`.
    fn word_symbol(word: &'a str) -> symbol::Symbol<'a> {
        match symbol::SYMBOLS.get(word) {
            // either keyword
            Some(token) => token.clone(),
            // or identifier
            None => symbol::sym(
                word,
                symbol::Token::Identifier,
                symbol::Group::Identifier,
            ),
        }
    }
}
/// Returns `true` when `ch` is an ASCII letter (`a-z`, `A-Z`) or an ASCII
/// decimal digit (`0-9`); every other character yields `false`.
fn is_letter_or_number(ch: char) -> bool {
    ch.is_ascii_alphanumeric()
}
/// Returns `true` when `ch` is one of the delimiter characters
/// `(`, `)`, `,` or `;`.
fn is_delimiter(ch: char) -> bool {
    matches!(ch, '(' | ')' | ',' | ';')
}