1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 Shipyard is a module to process data in a format inspired by
25 email headers (RFC 2822).
26
27 ===========
28 File format
29 ===========
30
31
32
33 Character encoding
34 ==================
35
36 A character encoding can be specified similar to :pep:`0263` using::
37
38 # -*- coding: <encoding name> -*-
39
40 in the first line. ``#`` is replaced with the actual `comment`_ mark.
41
42 More precisely, the first line must match the regular
43 expression::
44
45 ^#.*coding[:=]\s*([-\w.]+)
46
47 Again ``#`` is replaced by the actual `comment`_ mark. The first group
48 of this expression is then interpreted as encoding name.
49
50
51 Data set
52 ========
53
54 A *data set* consists of zero or more `records <#record>`__ separated
55 by one or more empty lines.
56
57 Comment
58 =======
59 Lines starting with the *comment mark* (default: ``#``) are
60 ignored. Comments can be used in or between `records <#record>`__.
61
62
63 Record
64 ======
65 A *record* consists of one or more `fields <#field>`__
66
67 Field
68 =====
69
70 A *field* is a line that has the form::
71
72 key: value
73
74 *key* is a string that
75 - doesn't contain a colon
76 - doesn't start with the `comment`_ mark
77 - doesn't start with the `continuation`_ mark
78
79 *value* is an arbitrary string. It can span multiple lines using
80 `continuation`_ marks.
81
82
83 Continuation
84 ============
85 If a line starts with the *continuation mark* (default: " " [one blank])
86 it gets appended to the preceding line, with the
87 continuation mark removed.
88
89
90
91
92
93 =====
94 Usage
95 =====
96
97 Obviously we need to import shipyard:
98 >>> import shipyard
99
100 First we open the file:
101 >>> input = open('nobel.sy')
102
103 Then we create a parser object:
104 >>> reader = shipyard.Parser(keep_linebreaks=False,
105 ... keys=['id', 'discipline', 'year',
106 ... 'name', 'country', 'rationale'])
107
108 For every record the given keys are initialized with None.
109
110 Now we can iterate through the records:
111
112 >>> for record in reader.parse(input): # doctest:+ELLIPSIS
113 ... print record['country']
114 United States
115 Japan
116 United States
117 ...
118
119 Instead of iterating we may want to get a list of dicts:
120 >>> input.seek(0)
121 >>> lod = reader.get_list(input)
122 >>> print lod # doctest:+ELLIPSIS
123 [{u'discipline': u'Chemistry', u'name': u'Martin Chalfie', ...}, {u'discipline': u'Chemistry', u'name': u'Osamu Shimomura', ...}, ...]
124
125 Sometimes we need a dict of dicts (using the 'id' field as key):
126 >>> input.seek(0)
127 >>> dod = reader.get_dict(input, key='id')
128 >>> print dod.keys()
129 [u'11', u'10', u'1', u'0', u'3', u'2', u'5', u'4', u'7', u'6', u'9', u'8']
130 >>> print dod[u'5'][u'rationale']
131 for the discovery of the mechanism of spontaneous brokensymmetry in subatomic physics
132
133
134 If we don't want dicts we can use the 'factory' parameter:
135 >>> input.seek(0)
136 >>> los = reader.get_list(input, factory = lambda **keys: ', '.join(keys.values()))
137 >>> print los[0]
138 Chemistry, Martin Chalfie, United States, for the discovery and development of the green fluorescentprotein, GFP, 2008, 0
139
140 Of course a class works as a factory, too:
141 >>> input.seek(0)
142 >>> class Laureate(object):
143 ... def __init__(self, id, discipline, year, name, country, rationale):
144 ... self.name = name
145 >>> doo = reader.get_dict(input, key='id', factory = Laureate)
146 >>> print doo[u'2'] # doctest:+ELLIPSIS
147 <Laureate object at ...>
148 >>> print doo[u'2'].name
149 Roger Y. Tsien
150
151 Now let's write a Shipyard file.
152
153 First we create a StringIO (any other file-like object will do, too):
154 >>> import StringIO
155 >>> output = StringIO.StringIO()
156
157 Next we need a Writer object:
158 >>> writer = shipyard.Writer(keys=('foo', 'bar'), coding='utf-8')
159
160 Now we can use write() to write a single record:
161 >>> writer.write(output, {'foo': 1, 'bar': 2})
162 >>> print output.getvalue()
163 foo: 1
164 bar: 2
165 <BLANKLINE>
166 <BLANKLINE>
167
168
169 Using write_many() we can write a list of records:
170 >>> output = StringIO.StringIO()
171 >>> d = [dict((('foo', i), ('bar', 2*i))) for i in range(3)]
172 >>> writer.write_many(output, d)
173 >>> print output.getvalue()
174 foo: 0
175 bar: 0
176 <BLANKLINE>
177 foo: 1
178 bar: 2
179 <BLANKLINE>
180 foo: 2
181 bar: 4
182 <BLANKLINE>
183 <BLANKLINE>
184
185
186 To get an encoding line we use write_coding():
187 >>> output = StringIO.StringIO()
188 >>> writer.write_coding(output)
189 >>> print output.getvalue()
190 #-*- coding: utf-8 -*-
191 <BLANKLINE>
192 <BLANKLINE>
193
194 Now let's do everything at once using write_full():
195 >>> output = StringIO.StringIO()
196 >>> writer.write_full(output, d)
197 >>> print output.getvalue()
198 #-*- coding: utf-8 -*-
199 <BLANKLINE>
200 foo: 0
201 bar: 0
202 <BLANKLINE>
203 foo: 1
204 bar: 2
205 <BLANKLINE>
206 foo: 2
207 bar: 4
208 <BLANKLINE>
209 <BLANKLINE>
210
211
212 """
213
214 import re
215
216
218 """
219 Something is wrong with a line
220
221 :see: `Parser.parse()`
222 """
223 pass
224
226 """
227 Something is wrong with a key
228
229 :see: `Parser.parse()`
230 """
231 pass
232
233
235 """
236 Reader for Shipyard files
237 """
def __init__(self, keys=None, defaults=None,
             keep_linebreaks=True,
             comment='#', continuation=' ', encode=True):
    """
    Set up a parser for Shipyard data.

    :Parameters:
      keys : iterable of strings or ``None``
        keys this parser accepts; they are stored as a set. If
        ``None`` (or empty) `parse()` accepts every key.
      defaults : dict or ``None``
        default values used to fill keys missing from a record
      keep_linebreaks : bool
        if True linebreaks in continuation lines are kept
      comment : string
        mark that starts a comment line
      continuation : string
        mark that starts a continuation line
      encode : bool
        True if a coding mark in the first line should be evaluated
    """
    self.defaults = defaults
    # Always store a set so the attribute has one type whether or not
    # keys were supplied; an empty set is falsy, just like the former ().
    if keys is not None:
        self.keys = set(keys)
    else:
        self.keys = set()
    self.keep_linebreaks = keep_linebreaks
    self.comment = comment
    self.continuation = continuation
    self.encode = encode
267
269 """
270 Add missing keys to a record
271
272 :Parameters:
273 record : dict
274 record to fill
275 """
276 if self.defaults is None:
277 func=lambda key: None
278 else:
279 func=lambda key: self.defaults.get(key, None)
280 if record:
281 for k in self.keys:
282 if not k in record:
283 record[k] = func(k)
284 return record
285
286
def parse(self, inpt, factory=None):
    """
    Iterate over the records found in ``inpt``.

    Blank lines end the current record, lines starting with the
    comment mark are skipped, lines starting with the continuation
    mark are appended to the value of the preceding field, and every
    other line must have the form ``key: value``. Each finished
    record is passed through ``self.fill_record()`` before it is
    yielded.

    If ``self.encode`` is true and the first line is a comment that
    contains a coding mark, all following lines are decoded with
    that encoding.

    :Parameters:
      inpt : iterable containing strings (e.g. a file)
        input to parse
      factory : callable to create the return values.
        If factory is not None, for every record the result of
        ``factory(**record)`` is yielded instead of the record dict

    :Exceptions:
      - `InvalidLineError`: if a continuation mark without a
        previous data line is found
      - `InvalidLineError`: if a data line without a ':' is found
      - `InvalidKeyError`: if in `__init__()` ``keys`` is given
        and a key is found that is not in keys

    Line numbers in the error messages are counted from 1.

    :see: Parser.get_list()
    """
    record = {}
    key = None
    coding = None
    is_first_line = True
    skip = len(self.continuation)
    # Count from 1 so the numbers in error messages match an editor.
    for num, line in enumerate(inpt, 1):
        if coding is not None:
            line = line.decode(coding)

        if line.strip() == '':
            # A blank line closes the current record, if there is one.
            record = self.fill_record(record)
            if record:
                if factory is not None:
                    yield factory(**record)
                else:
                    yield record
            record = {}
            key = None
        elif line.startswith(self.comment):
            # Only a comment in the very first line may carry a coding mark.
            if self.encode and is_first_line:
                match = re.search(r"coding[:=]\s*([-\w.]+)", line)
                if match:
                    coding = match.group(1)
        elif line.startswith(self.continuation):
            if key is None:
                raise InvalidLineError('Invalid line %s' % num)
            extra = line[skip:].rstrip('\n')
            if self.keep_linebreaks:
                extra = '\n' + extra
            record[key] += extra
        else:
            try:
                key, value = line.split(':', 1)
            except ValueError as e:
                # No ':' in the line, so it cannot be unpacked into two parts.
                raise InvalidLineError('Invalid line %s: %s' % (num, e))
            # factory(**record) needs native-string keyword names; keys
            # that already are native strings must not be re-encoded.
            if factory is not None and not isinstance(key, str):
                key = key.encode('utf-8')
            key = key.strip()
            if self.keys and key not in self.keys:
                raise InvalidKeyError('Invalid key "%s" on line %s' %
                                      (key, num))
            record[key] = value.lstrip().rstrip('\n')
        is_first_line = False

    # The input may end without a trailing blank line.
    record = self.fill_record(record)
    if record:
        if factory is not None:
            yield factory(**record)
        else:
            yield record
360
361
def get_list(self, inpt, factory=None):
    """
    Return a list of all records.

    This simply collects everything `parse()` yields.

    :Parameters:
      inpt : iterable containing strings (e.g. a file)
        input to parse
      factory : callable to create the return values.
        If factory is not None, for every record the result of
        ``factory(**record)`` is returned

    :Exceptions:
      - `InvalidLineError`: if a continuation mark without a
        previous data line is found
      - `InvalidLineError`: if a data line without a ':' is found
      - `InvalidKeyError`: if in `__init__()` ``keys`` is given
        and a key is found that is not in keys

    :see: Parser.parse()
    """
    return list(self.parse(inpt, factory))
386
387
def get_dict(self, inpt, key, factory=None):
    """
    Return all records as a dict indexed by one of their fields.

    If several records share the same value in the ``key`` field,
    the last one wins.

    :Parameters:
      inpt : iterable containing strings (e.g. a file)
        input to parse
      key : string
        name of the field used as key for the result dict
      factory : callable to create the return values.
        If factory is not None, for every record the result of
        ``factory(**record)`` is stored

    :Exceptions:
      - `InvalidLineError`: if a continuation mark without a
        previous data line is found
      - `InvalidLineError`: if a data line without a ':' is found
      - `InvalidKeyError`: if in `__init__()` ``keys`` is given
        and a key is found that is not in keys

    :see: Parser.parse()
    """
    index = {}
    # parse() is always asked for plain dicts: the ``key`` field has to
    # be looked up before the caller's factory (if any) is applied.
    for fields in self.parse(inpt, factory=dict):
        if factory is not None:
            entry = factory(**fields)
        else:
            entry = fields
        index[fields[key]] = entry
    return index
421
422
423
424
425
427 """
428 Writer for Shipyard files
429 """
430
def __init__(self, keys=None, comment='#', continuation=' ', coding=None,
             ignore_values=None):
    """
    Set up a writer for Shipyard data.

    :Parameters:
      keys : list of strings
        list of keys this writer accepts
      comment : string
        mark that starts a comment line
      continuation : string
        mark that starts a continuation line
      coding : string
        character encoding to use
      ignore_values : set or ``None``
        fields containing one of these values aren't written;
        ``None`` means that no value is ignored
    """
    # A fresh set per instance stands in for "nothing is ignored".
    if ignore_values is not None:
        self.ignore_values = ignore_values
    else:
        self.ignore_values = set()
    self.coding = coding
    self.continuation = continuation
    self.comment = comment
    self.keys = keys
456
def write(self, output, record):
    """
    Write a single record, followed by an empty line.

    If in `__init__()` ``keys`` is given only those values are
    written (in that order); otherwise every key of ``record`` is
    written. Fields whose value is in ``self.ignore_values`` are
    skipped. Values that span several lines are written using
    continuation marks.

    :Parameters:
      output : needs a method ``write`` that takes a string parameter
        output to write to
      record : ``dict``
        record to write

    :Exceptions:
      - ``KeyError``: if ``keys`` was given in `__init__()` and one
        of them is missing from ``record``
      - ``LookupError``: if ``self.coding`` names an unknown codec
    """
    if self.keys:
        keys = self.keys
    else:
        keys = record.keys()
    separator = '\n' + self.continuation
    for key in keys:
        current = record[key]
        if current in self.ignore_values:
            continue
        # Anything that is not string-like is rendered with str() first.
        if hasattr(current, 'encode'):
            value = current
        else:
            value = str(current)
        # Without a coding there is nothing to encode; calling
        # .encode(None) would only raise a TypeError.
        if self.coding is not None:
            try:
                value = value.encode(self.coding)
            except UnicodeError:
                # The value cannot be converted (e.g. a byte string that
                # is already encoded): write it unchanged.
                pass
        value = separator.join(value.splitlines())
        output.write('%s: %s\n' % (key, value))

    output.write('\n')
490
492 """
493 Write a list of records
494
495 If in `__init__()` ``keys`` is given only those values are written
496
497 :Parameters:
498 output : needs a method ``write`` that takes a string parameter
499 output to write to
500 records : list of dicts
501 records to write
502 """
503 for rec in records:
504 self.write(output, rec)
505
521
def write_coding(self, output, template='-*- coding: %s -*-\n\n'):
    """
    Write the coding line for the coding given in __init__().

    Nothing is written if no coding was given.

    :Parameters:
      output : needs a method ``write`` that takes a string parameter
        output to write to
      template : string
        template string for the coding line; ``%s`` is replaced by
        the coding and the comment mark is prepended
    """
    if self.coding is None:
        return
    coding_line = template % self.coding
    output.write(self.comment + coding_line)
536
538 """
539 Writes a list of records. An output file is created if
540 ``output`` is a string.
541 A coding line is written if ``self.coding`` is not None.
542
543 :Parameters:
544 output : string or file-like
545 output to write to. If it is an instance of basestring
546 it's taken as a file name, otherwise its write() method is used
547 records : list of dicts
548 records to write
549 """
550 needs_close = False
551 if isinstance(output, basestring):
552 output = open(output, 'w')
553 needs_close = True
554 if self.coding is not None:
555 self.write_coding(output)
556 self.write_many(output, records)
557 if needs_close:
558 output.close()
559