14: Lexical Scanner Implementation (4)

2018/10/29

I wrote a scanner that turns a message into tokens. The code is in lexer.rs.

The algorithm is simple. The scanner reads the message character by character and keeps two cursors, cursor_l and cursor_r, which mark the start and end of the word currently being read. When the incoming character is a separator or a delimiter, the scanner checks the word selected by the two cursors. If the word is a keyword, the scanner adds the keyword's token. Otherwise, it adds an identifier token carrying the word as its name.

There are still some small parts that need fixing, but the function is almost done.

If we have a message:

select customername, contactname, address from customers where address is null;

then the scanner will get:

[
    Symbol { name: "select", len: 6, token: Select, group: Keyword },
    Symbol { name: "customername", len: 12, token: Identifier, group: Identifier },
    Symbol { name: ",", len: 1, token: Comma, group: Delimiter },
    Symbol { name: "", len: 0, token: Identifier, group: Identifier },
    Symbol { name: "contactname", len: 11, token: Identifier, group: Identifier },
    Symbol { name: ",", len: 1, token: Comma, group: Delimiter },
    Symbol { name: "", len: 0, token: Identifier, group: Identifier },
    Symbol { name: "address", len: 7, token: Identifier, group: Identifier },
    Symbol { name: "from", len: 4, token: From, group: Keyword },
    Symbol { name: "customers", len: 9, token: Identifier, group: Identifier },
    Symbol { name: "where", len: 5, token: Where, group: Keyword },
    Symbol { name: "address", len: 7, token: Identifier, group: Identifier },
    Symbol { name: "is", len: 2, token: Identifier, group: Identifier },
    Symbol { name: "null", len: 4, token: Identifier, group: Identifier },
    Symbol { name: ";", len: 1, token: Semicolon, group: Delimiter }
]

`is null` should be recognized as a single token, so I will fix that later.

As you can see, we get the tokens, and we can use these tokens in the next step.

sql/lexer.rs

use sql::symbol;

/// Lexical scanner that turns a message into a list of symbols.
#[derive(Debug, Clone)]
pub struct Scanner<'a> {
    /// Text being scanned; `Scanner::new` stores it lowercased and trimmed.
    message: String,
    /// Symbols collected so far by `scan_tokens`.
    tokens: Vec<symbol::Symbol<'a>>,
    /// Cursors delimiting the word that is currently being read.
    pos: Pos,
}

/// Pair of cursors marking the pending word inside the scanner's message.
///
/// The word is read as the range `cursor_l..cursor_r` of the message.
#[derive(Debug, Clone)]
struct Pos {
    /// Start of the pending word (lower bound of the slice).
    cursor_l: usize,
    /// End of the pending word (exclusive upper bound of the slice).
    cursor_r: usize,
}

impl<'a> Scanner<'a> {
    /// Creates a scanner for `message`.
    ///
    /// The message is lowercased and trimmed before it is stored, the token
    /// list starts empty and both cursors start at 0.
    pub fn new(message: &str) -> Scanner {
        Scanner {
            message: message.to_lowercase().trim().to_string(),
            tokens: vec![],
            pos: Pos {
                cursor_l: 0,
                cursor_r: 0,
            },
        }
    }

    /// Scans the stored message and returns a copy of the collected symbols.
    ///
    /// The message is read character by character:
    ///
    /// * letters and digits (see `is_letter_or_number`) extend the pending
    ///   word;
    /// * any other character ends the pending word, which is emitted as a
    ///   keyword when `symbol::SYMBOLS` knows it and as an identifier
    ///   otherwise;
    /// * delimiters (see `is_delimiter`) additionally emit their own symbol,
    ///   whether or not a word directly precedes them;
    /// * whitespace and unsupported characters are skipped.
    ///
    /// A word that runs up to the end of the message is emitted as well.
    /// Progress is traced on stdout with `println!`.
    ///
    /// # Panics
    ///
    /// Panics if `symbol::Symbol::match_delimiter` yields no symbol for a
    /// character accepted by `is_delimiter`.
    pub fn scan_tokens(&'a mut self) -> Vec<symbol::Symbol<'a>> {
        println!("Starting scanning message: {}", self.message);
        // The cursors hold byte offsets taken from `char_indices`, so they
        // always sit on character boundaries and stay aligned with the text
        // even after skipped or multi-byte characters.
        for (offset, x) in self.message.char_indices() {
            let next = offset + x.len_utf8();
            if is_letter_or_number(x) {
                self.pos.cursor_r = next;
                continue;
            }
            // Every non-word character terminates the pending word.
            if self.pos.cursor_l != self.pos.cursor_r {
                if let Some(word) =
                    self.message.get(self.pos.cursor_l..self.pos.cursor_r)
                {
                    println!("encounter `{}`, last word is {}", x, word);
                    Self::push_word(&mut self.tokens, word);
                }
            }
            // A delimiter is a token of its own, even right after another
            // delimiter or a separator (e.g. the `)` and `;` in `foo();`).
            if is_delimiter(x) {
                self.tokens.push(
                    symbol::Symbol::match_delimiter(x)
                        .expect("every delimiter character must map to a symbol"),
                );
            }
            // Whitespace and unsupported characters are skipped; either way
            // the next word can only start after `x`.
            self.pos.cursor_r = next;
            self.pos.cursor_l = next;
        }
        // The message may end in the middle of a word (no trailing `;`).
        if self.pos.cursor_l != self.pos.cursor_r {
            if let Some(word) =
                self.message.get(self.pos.cursor_l..self.pos.cursor_r)
            {
                println!("reached end of message, last word is {}", word);
                Self::push_word(&mut self.tokens, word);
            }
            self.pos.cursor_l = self.pos.cursor_r;
        }
        self.tokens.clone()
    }

    /// Appends the symbol for `word`: the keyword registered in
    /// `symbol::SYMBOLS` when there is one, otherwise an identifier.
    fn push_word(tokens: &mut Vec<symbol::Symbol<'a>>, word: &'a str) {
        match symbol::SYMBOLS.get(word) {
            // either keyword
            Some(token) => tokens.push(token.clone()),
            // or identifier
            None => tokens.push(symbol::sym(
                word,
                symbol::Token::Identifier,
                symbol::Group::Identifier,
            )),
        }
    }
}

/// Reports whether `ch` can be part of a word: an ASCII letter (either case)
/// or an ASCII decimal digit.
fn is_letter_or_number(ch: char) -> bool {
    ch.is_ascii_alphanumeric()
}

/// Reports whether `ch` is one of the delimiter characters `(`, `)`, `,`
/// or `;`.
fn is_delimiter(ch: char) -> bool {
    match ch {
        '(' | ')' | ',' | ';' => true,
        _ => false,
    }
}

results matching ""

    No results matching ""