Tokenizer.cs 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561
  1. // JsonKit v0.5 - A simple but flexible Json library in a single .cs file.
  2. //
  3. // Copyright (C) 2014 Topten Software (contact@toptensoftware.com) All rights reserved.
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this product
  6. // except in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed under the
  11. // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
  12. // either express or implied. See the License for the specific language governing permissions
  13. // and limitations under the License.
  14. // Define JsonKit_NO_DYNAMIC to disable Expando support
  15. // Define JsonKit_NO_EMIT to disable Reflection.Emit
  16. // Define JsonKit_NO_DATACONTRACT to disable support for [DataContract]/[DataMember]
  17. using System;
  18. using System.Collections.Generic;
  19. using System.Text;
  20. using System.IO;
  21. using System.Globalization;
  22. namespace Topten.JsonKit
  23. {
  24. public class Tokenizer
  25. {
  // Create a tokenizer over the supplied reader and position it on the first token.
  //   r       - source of the text to tokenize
  //   options - parser options; this class only tests the StrictParser flag
  public Tokenizer(TextReader r, JsonOptions options)
  {
      _options = options;
      _underlying = r;

      // Prime the pipeline: first buffer, then first character, then first token,
      // so CurrentToken is valid as soon as construction completes.
      FillBuffer();
      NextChar();
      NextToken();
  }
  // --- Input source and read buffer ---
  private readonly TextReader _underlying;
  private readonly JsonOptions _options;
  private readonly char[] _buf = new char[4096];   // block read from _underlying by FillBuffer
  private int _pos;                                // index of the next unread char in _buf
  private int _bufUsed;                            // number of chars FillBuffer placed in _buf

  // --- Rewind support (non-null only while bookmarks are in use) ---
  private StringBuilder _rewindBuffer;             // characters recorded so they can be replayed
  private int _rewindBufferPos;                    // replay/record position inside _rewindBuffer
  private readonly Stack<ReaderState> _bookmarks = new Stack<ReaderState>();

  // --- Current character ---
  private LineOffset _currentCharPos;
  private char _currentChar;

  // Scratch builder reused for string, identifier and number text
  private readonly StringBuilder _sb = new StringBuilder();

  // --- Current token (public state, updated by NextToken) ---
  public LineOffset CurrentTokenPosition;
  public Token CurrentToken;
  public LiteralKind LiteralKind;
  public string String;
  // Converts the text of the current literal token into a boxed value:
  //   Null -> null, True/False -> bool, String -> string,
  //   SignedInteger -> long, UnsignedInteger -> ulong, FloatingPoint -> double.
  // Integer literals may carry a "0x"/"0X" prefix (after the sign, if any), in
  // which case the digits are read as hexadecimal.
  // Throws InvalidOperationException if the current token is not a literal, and
  // OverflowException if a negative hex literal does not fit in a long.
  public object LiteralValue
  {
      get
      {
          if (CurrentToken != Token.Literal)
              throw new InvalidOperationException("token is not a literal");

          string text = String;
          switch (LiteralKind)
          {
              case LiteralKind.Null: return null;
              case LiteralKind.False: return false;
              case LiteralKind.True: return true;
              case LiteralKind.String: return text;

              case LiteralKind.SignedInteger:
                  // The tokenizer classifies "-0x1F" as a signed integer, but
                  // long.Parse cannot read a hex prefix, so decode it here.
                  if (HasHexPrefix(text, 1))
                  {
                      ulong magnitude = Convert.ToUInt64(text.Substring(3), 16);
                      if (magnitude > 0x8000000000000000UL)
                          throw new OverflowException("Value was either too large or too small for an Int64.");

                      // unchecked so a magnitude of 2^63 maps onto long.MinValue
                      return unchecked(-(long)magnitude);
                  }
                  return long.Parse(text, CultureInfo.InvariantCulture);

              case LiteralKind.UnsignedInteger:
                  if (HasHexPrefix(text, 0))
                      return Convert.ToUInt64(text.Substring(2), 16);
                  return ulong.Parse(text, CultureInfo.InvariantCulture);

              case LiteralKind.FloatingPoint:
                  return double.Parse(text, CultureInfo.InvariantCulture);
          }
          return null;
      }
  }

  // True if text has "0x" or "0X" starting at index. Compares characters
  // directly so the result never depends on the current culture.
  private static bool HasHexPrefix(string text, int index)
  {
      return text != null
          && text.Length > index + 1
          && text[index] == '0'
          && (text[index + 1] == 'x' || text[index + 1] == 'X');
  }
  // The .NET type that LiteralValue produces for the current literal token.
  // A null literal reports typeof(object); an unrecognised kind yields null.
  // Throws InvalidOperationException if the current token is not a literal.
  public Type LiteralType
  {
      get
      {
          if (CurrentToken != Token.Literal)
          {
              throw new InvalidOperationException("token is not a literal");
          }

          LiteralKind kind = LiteralKind;

          if (kind == LiteralKind.Null)
              return typeof(object);
          if (kind == LiteralKind.False || kind == LiteralKind.True)
              return typeof(bool);
          if (kind == LiteralKind.String)
              return typeof(string);
          if (kind == LiteralKind.SignedInteger)
              return typeof(long);
          if (kind == LiteralKind.UnsignedInteger)
              return typeof(ulong);
          if (kind == LiteralKind.FloatingPoint)
              return typeof(double);

          return null;
      }
  }
  // Snapshot of the tokenizer taken when a bookmark is created and written back
  // on rewind. It holds the current character and its position, the current
  // token (kind, position, text, literal kind) and the rewind-buffer position.
  // The read buffer itself (_buf/_pos/_bufUsed) is not part of the snapshot.
  struct ReaderState
  {
      private readonly char _currentChar;
      private readonly LineOffset _currentCharPos;
      private readonly Token _currentToken;
      private readonly LineOffset _currentTokenPos;
      private readonly LiteralKind _literalKind;
      private readonly string _string;
      private readonly int _rewindBufferPos;

      // Capture the tokenizer's current state
      public ReaderState(Tokenizer tokenizer)
      {
          _currentChar = tokenizer._currentChar;
          _currentCharPos = tokenizer._currentCharPos;
          _currentToken = tokenizer.CurrentToken;
          _currentTokenPos = tokenizer.CurrentTokenPosition;
          _literalKind = tokenizer.LiteralKind;
          _string = tokenizer.String;
          _rewindBufferPos = tokenizer._rewindBufferPos;
      }

      // Write the captured state back into the tokenizer
      public void Apply(Tokenizer tokenizer)
      {
          tokenizer._currentChar = _currentChar;
          tokenizer._currentCharPos = _currentCharPos;
          tokenizer.CurrentToken = _currentToken;
          tokenizer.CurrentTokenPosition = _currentTokenPos;
          tokenizer.LiteralKind = _literalKind;
          tokenizer.String = _string;
          tokenizer._rewindBufferPos = _rewindBufferPos;
      }
  }
  // Push a bookmark holding the current reader state. From the first bookmark
  // onwards, characters read are also recorded in the rewind buffer so that
  // they can be replayed after RewindToBookmark.
  public void CreateBookmark()
  {
      // Snapshot before touching the rewind buffer so the saved position is the old one
      var snapshot = new ReaderState(this);
      _bookmarks.Push(snapshot);

      if (_rewindBuffer != null)
      {
          return;     // already recording
      }

      _rewindBuffer = new StringBuilder();
      _rewindBufferPos = 0;
  }
  // Discard the most recent bookmark without rewinding.
  // Once no bookmarks remain the rewind buffer is released, but only if every
  // recorded character has already been consumed. After an earlier
  // RewindToBookmark the buffer may still hold characters waiting to be
  // replayed; dropping it at that point would silently lose that input.
  // Throws InvalidOperationException (from Stack.Pop) if there is no bookmark.
  public void DiscardBookmark()
  {
      _bookmarks.Pop();

      if (_bookmarks.Count == 0
          && _rewindBuffer != null
          && _rewindBufferPos >= _rewindBuffer.Length)
      {
          _rewindBuffer = null;
          _rewindBufferPos = 0;
      }
  }
  // Pop the most recent bookmark and restore the reader state it captured.
  // Throws InvalidOperationException (from Stack.Pop) if there is no bookmark.
  public void RewindToBookmark()
  {
      ReaderState saved = _bookmarks.Pop();
      saved.Apply(this);
  }
  // Load the next block of characters from the underlying TextReader into _buf
  // and reset the read position. _bufUsed receives the count returned by Read.
  private void FillBuffer()
  {
      int charsRead = _underlying.Read(_buf, 0, _buf.Length);
      _bufUsed = charsRead;
      _pos = 0;
  }
  // Advance to the next input character, store it in _currentChar and return it.
  // Returns '\0' once the input is exhausted.
  // Two modes:
  //   - no rewind buffer: read straight from _buf, refilling as needed
  //   - rewind buffer active: replay recorded characters first, then read fresh
  //     characters and record each one
  // (kept as one method on purpose - the original author measured the inlined
  // form as faster)
  public char NextChar()
  {
      if (_rewindBuffer != null)
      {
          // Replay previously recorded input
          if (_rewindBufferPos < _rewindBuffer.Length)
          {
              _currentCharPos.Offset++;
              _currentChar = _rewindBuffer[_rewindBufferPos++];
              return _currentChar;
          }

          // Replay exhausted: read a fresh character and record it
          if (_pos >= _bufUsed && _bufUsed > 0)
          {
              FillBuffer();
          }

          if (_bufUsed == 0)
          {
              _currentChar = '\0';
          }
          else
          {
              _currentChar = _buf[_pos++];
          }

          _rewindBuffer.Append(_currentChar);
          _rewindBufferPos++;
          _currentCharPos.Offset++;
          return _currentChar;
      }

      // Plain buffered read
      if (_pos >= _bufUsed)
      {
          if (_bufUsed > 0)
          {
              FillBuffer();
          }

          if (_bufUsed == 0)
          {
              // End of input (position is left unchanged)
              _currentChar = '\0';
              return _currentChar;
          }
      }

      _currentCharPos.Offset++;
      _currentChar = _buf[_pos++];
      return _currentChar;
  }
  // Read the next token from the input stream.
  // On return CurrentToken holds the token kind and CurrentTokenPosition the
  // position where it started. For literals, LiteralKind is set; for strings,
  // numbers and identifiers, String holds the token text.
  // Whitespace is skipped; comments are skipped unless StrictParser is set.
  // Throws InvalidDataException on a syntax error.
  // (Mostly inline for performance)
  public void NextToken()
  {
      while (true)
      {
          // Skip whitespace and handle line numbers
          while (true)
          {
              if (_currentChar == '\r')
              {
                  // "\r" or "\r\n" counts as one line break
                  if (NextChar() == '\n')
                  {
                      NextChar();
                  }
                  _currentCharPos.Line++;
                  _currentCharPos.Offset = 0;
              }
              else if (_currentChar == '\n')
              {
                  // "\n" or "\n\r" counts as one line break
                  if (NextChar() == '\r')
                  {
                      NextChar();
                  }
                  _currentCharPos.Line++;
                  _currentCharPos.Offset = 0;
              }
              else if (_currentChar == ' ' || _currentChar == '\t')
              {
                  NextChar();
              }
              else
              {
                  break;
              }
          }

          // Remember position of token
          CurrentTokenPosition = _currentCharPos;

          // Handle common characters first
          switch (_currentChar)
          {
              case '/':
                  // A comment is not a token: skip it and start over
                  SkipComment();
                  continue;

              case '\"':
              case '\'':
              {
                  _sb.Length = 0;
                  var quoteKind = _currentChar;
                  NextChar();
                  while (_currentChar != '\0')
                  {
                      if (_currentChar == '\\')
                      {
                          NextChar();
                          var escape = _currentChar;
                          switch (escape)
                          {
                              case '\"': _sb.Append('\"'); break;
                              case '\\': _sb.Append('\\'); break;
                              case '/': _sb.Append('/'); break;
                              case 'b': _sb.Append('\b'); break;
                              case 'f': _sb.Append('\f'); break;
                              case 'n': _sb.Append('\n'); break;
                              case 'r': _sb.Append('\r'); break;
                              case 't': _sb.Append('\t'); break;

                              case '\'':
                                  // Not a standard JSON escape. Accepted by the relaxed
                                  // parser only, so that a single-quoted string can
                                  // contain its own quote character.
                                  if ((_options & JsonOptions.StrictParser) != 0)
                                      goto default;
                                  _sb.Append('\'');
                                  break;

                              case 'u':
                                  // Exactly four hex digits follow
                                  var sbHex = new StringBuilder();
                                  for (int i = 0; i < 4; i++)
                                  {
                                      NextChar();
                                      sbHex.Append(_currentChar);
                                  }
                                  _sb.Append((char)Convert.ToUInt16(sbHex.ToString(), 16));
                                  break;

                              default:
                                  throw new InvalidDataException(string.Format("Invalid escape sequence in string literal: '\\{0}'", _currentChar));
                          }
                      }
                      else if (_currentChar == quoteKind)
                      {
                          // Closing quote: publish the string and step past it
                          String = _sb.ToString();
                          CurrentToken = Token.Literal;
                          LiteralKind = LiteralKind.String;
                          NextChar();
                          return;
                      }
                      else
                      {
                          _sb.Append(_currentChar);
                      }

                      NextChar();
                  }
                  throw new InvalidDataException("syntax error, unterminated string literal");
              }

              case '{': CurrentToken = Token.OpenBrace; NextChar(); return;
              case '}': CurrentToken = Token.CloseBrace; NextChar(); return;
              case '[': CurrentToken = Token.OpenSquare; NextChar(); return;
              case ']': CurrentToken = Token.CloseSquare; NextChar(); return;
              case '=': CurrentToken = Token.Equal; NextChar(); return;
              case ':': CurrentToken = Token.Colon; NextChar(); return;
              case ';': CurrentToken = Token.SemiColon; NextChar(); return;
              case ',': CurrentToken = Token.Comma; NextChar(); return;
              case '\0': CurrentToken = Token.EOF; return;
          }

          // Number?
          if (char.IsDigit(_currentChar) || _currentChar == '-')
          {
              TokenizeNumber();
              return;
          }

          // Identifier? (checked for after everything else as identifiers are actually quite rare in valid json)
          if (Char.IsLetter(_currentChar) || _currentChar == '_' || _currentChar == '$')
          {
              // Find end of identifier
              _sb.Length = 0;
              while (Char.IsLetterOrDigit(_currentChar) || _currentChar == '_' || _currentChar == '$')
              {
                  _sb.Append(_currentChar);
                  NextChar();
              }
              String = _sb.ToString();

              // Handle special identifiers
              switch (String)
              {
                  case "true":
                      LiteralKind = LiteralKind.True;
                      CurrentToken = Token.Literal;
                      return;

                  case "false":
                      LiteralKind = LiteralKind.False;
                      CurrentToken = Token.Literal;
                      return;

                  case "null":
                      LiteralKind = LiteralKind.Null;
                      CurrentToken = Token.Literal;
                      return;
              }

              CurrentToken = Token.Identifier;
              return;
          }

          // What the?
          throw new InvalidDataException(string.Format("syntax error, unexpected character '{0}'", _currentChar));
      }
  }

  // Skip a "//" line comment or a "/* ... */" block comment. On entry the
  // current character is the leading '/'. A line comment stops at (not past)
  // the line break; a block comment stops just past the closing "*/". A block
  // comment that runs to end of input is accepted silently.
  // Throws InvalidDataException in strict mode (comments are not supported
  // there) or if the '/' is not followed by '/' or '*'.
  private void SkipComment()
  {
      // Comments are not supported in strict mode
      if ((_options & JsonOptions.StrictParser) != 0)
      {
          throw new InvalidDataException(string.Format("syntax error, unexpected character '{0}'", _currentChar));
      }

      NextChar();
      switch (_currentChar)
      {
          case '/':
              NextChar();
              while (_currentChar != '\0' && _currentChar != '\r' && _currentChar != '\n')
              {
                  NextChar();
              }
              break;

          case '*':
              // Step past the opening '*' first so that "/*/" is not taken
              // for a complete comment.
              NextChar();

              // Track whether the previous character was '*'. Re-evaluating it
              // on every character means a run such as "**/" still terminates.
              bool starSeen = false;
              while (_currentChar != '\0')
              {
                  if (starSeen && _currentChar == '/')
                  {
                      NextChar();
                      break;
                  }
                  starSeen = _currentChar == '*';
                  NextChar();
              }
              break;

          default:
              throw new InvalidDataException("syntax error, unexpected character after slash");
      }
  }
  // Collect a sequence of characters that could make up a valid number.
  // On return String holds the raw text, CurrentToken is Literal and
  // LiteralKind is FloatingPoint (a '.' or exponent was seen), SignedInteger
  // (leading '-') or UnsignedInteger. Outside strict mode a "0x"/"0X" prefix
  // switches to hexadecimal digits.
  // For performance, we don't actually parse it into a number yet. When using
  // Topten.JsonKitEmit we parse later, directly into a value type to avoid boxing.
  // Throws InvalidDataException if a letter immediately follows the number.
  private void TokenizeNumber()
  {
      _sb.Length = 0;

      // Leading negative sign
      bool signed = false;
      if (_currentChar == '-')
      {
          signed = true;
          _sb.Append(_currentChar);
          NextChar();
      }

      // Hex prefix? (not recognised in strict mode)
      bool hex = false;
      if (_currentChar == '0' && (_options & JsonOptions.StrictParser) == 0)
      {
          _sb.Append(_currentChar);
          NextChar();
          if (_currentChar == 'x' || _currentChar == 'X')
          {
              _sb.Append(_currentChar);
              NextChar();
              hex = true;
          }
      }

      // Process characters, but vaguely figure out what type it is
      bool cont = true;
      bool fp = false;
      while (cont)
      {
          switch (_currentChar)
          {
              case '0':
              case '1':
              case '2':
              case '3':
              case '4':
              case '5':
              case '6':
              case '7':
              case '8':
              case '9':
                  _sb.Append(_currentChar);
                  NextChar();
                  break;

              case 'A':
              case 'a':
              case 'B':
              case 'b':
              case 'C':
              case 'c':
              case 'D':
              case 'd':
              case 'F':
              case 'f':
                  // Only digits in a hex literal; otherwise the number ends here
                  if (hex)
                  {
                      _sb.Append(_currentChar);
                      NextChar();
                  }
                  else
                  {
                      cont = false;
                  }
                  break;

              case '.':
                  if (hex)
                  {
                      cont = false;
                  }
                  else
                  {
                      fp = true;
                      _sb.Append(_currentChar);
                      NextChar();
                  }
                  break;

              case 'E':
              case 'e':
                  if (hex)
                  {
                      // In a hex literal 'e' is an ordinary digit. It must be
                      // consumed here, otherwise this loop never advances.
                      _sb.Append(_currentChar);
                      NextChar();
                  }
                  else
                  {
                      // Exponent marker, optionally followed by a sign
                      fp = true;
                      _sb.Append(_currentChar);
                      NextChar();
                      if (_currentChar == '+' || _currentChar == '-')
                      {
                          _sb.Append(_currentChar);
                          NextChar();
                      }
                  }
                  break;

              default:
                  cont = false;
                  break;
          }
      }

      if (char.IsLetter(_currentChar))
          throw new InvalidDataException(string.Format("syntax error, invalid character following number '{0}'", _sb.ToString()));

      // Setup token
      String = _sb.ToString();
      CurrentToken = Token.Literal;

      // Setup literal kind
      if (fp)
      {
          LiteralKind = LiteralKind.FloatingPoint;
      }
      else if (signed)
      {
          LiteralKind = LiteralKind.SignedInteger;
      }
      else
      {
          LiteralKind = LiteralKind.UnsignedInteger;
      }
  }
  // Verify that the current token is the one required.
  // Throws InvalidDataException naming both tokens if it is not.
  public void Check(Token tokenRequired)
  {
      if (tokenRequired == CurrentToken)
          return;

      string message = string.Format("syntax error, expected {0} found {1}", tokenRequired, CurrentToken);
      throw new InvalidDataException(message);
  }
  // Consume the current token, which must be the one required.
  // Throws InvalidDataException (via Check) on a mismatch; otherwise advances
  // to the next token.
  public void Skip(Token tokenRequired)
  {
      Check(tokenRequired);

      NextToken();
  }
  // Consume the current token only if it is the one required.
  // Returns true if the token matched and was skipped, false if it was left in place.
  public bool SkipIf(Token tokenRequired)
  {
      bool matched = tokenRequired == CurrentToken;
      if (matched)
      {
          NextToken();
      }
      return matched;
  }
  510. }
  511. }