1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
def dummy_wrapper(str):
    """Dummy translation wrapper: hand back the message untranslated.

    Installed as ``_`` when the gettext catalogue cannot be set up.

    :param str: the message to "translate" (the name shadows the builtin,
                but it is part of the signature and is therefore kept).
    :return: whatever to_unicode() returns for the message.
    """
    return to_unicode(str)
22
def dummyP_wrapper(str1, str2, n):
    """Dummy plural translation wrapper.

    Installed as ``P_`` when the gettext catalogue cannot be set up, so it
    takes its arguments in the same order as ``ungettext``.

    :param str1: the singular form.
    :param str2: the plural form.
    :param n: the count that selects between the two forms.
    :return: str1 when n == 1, otherwise str2.
    """
    if n == 1:
        return str1
    return str2
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def __utf8_bisearch(ucs, table):
    """ auxiliary function for binary search in interval table.

    :param ucs: the integer code point to look up.
    :param table: a sequence of (first, last) inclusive intervals, sorted in
                  ascending order and non-overlapping.
    :return: True if ucs falls inside one of the intervals, else False.
    """
    if not table:
        # Nothing to search; avoids indexing into an empty sequence below.
        return False

    low = 0
    high = len(table) - 1
    # Cheap rejection of anything outside the span covered by the table.
    if ucs < table[low][0] or ucs > table[high][1]:
        return False

    while high >= low:
        # Floor division keeps the midpoint an integer index regardless of
        # how the interpreter treats "/" on two ints.
        mid = (low + high) // 2
        if ucs > table[mid][1]:
            low = mid + 1
        elif ucs < table[mid][0]:
            high = mid - 1
        else:
            return True

    return False
84
85
86
87
88 __combining = (
89 ( 0x0300, 0x036F ), ( 0x0483, 0x0486 ), ( 0x0488, 0x0489 ),
90 ( 0x0591, 0x05BD ), ( 0x05BF, 0x05BF ), ( 0x05C1, 0x05C2 ),
91 ( 0x05C4, 0x05C5 ), ( 0x05C7, 0x05C7 ), ( 0x0600, 0x0603 ),
92 ( 0x0610, 0x0615 ), ( 0x064B, 0x065E ), ( 0x0670, 0x0670 ),
93 ( 0x06D6, 0x06E4 ), ( 0x06E7, 0x06E8 ), ( 0x06EA, 0x06ED ),
94 ( 0x070F, 0x070F ), ( 0x0711, 0x0711 ), ( 0x0730, 0x074A ),
95 ( 0x07A6, 0x07B0 ), ( 0x07EB, 0x07F3 ), ( 0x0901, 0x0902 ),
96 ( 0x093C, 0x093C ), ( 0x0941, 0x0948 ), ( 0x094D, 0x094D ),
97 ( 0x0951, 0x0954 ), ( 0x0962, 0x0963 ), ( 0x0981, 0x0981 ),
98 ( 0x09BC, 0x09BC ), ( 0x09C1, 0x09C4 ), ( 0x09CD, 0x09CD ),
99 ( 0x09E2, 0x09E3 ), ( 0x0A01, 0x0A02 ), ( 0x0A3C, 0x0A3C ),
100 ( 0x0A41, 0x0A42 ), ( 0x0A47, 0x0A48 ), ( 0x0A4B, 0x0A4D ),
101 ( 0x0A70, 0x0A71 ), ( 0x0A81, 0x0A82 ), ( 0x0ABC, 0x0ABC ),
102 ( 0x0AC1, 0x0AC5 ), ( 0x0AC7, 0x0AC8 ), ( 0x0ACD, 0x0ACD ),
103 ( 0x0AE2, 0x0AE3 ), ( 0x0B01, 0x0B01 ), ( 0x0B3C, 0x0B3C ),
104 ( 0x0B3F, 0x0B3F ), ( 0x0B41, 0x0B43 ), ( 0x0B4D, 0x0B4D ),
105 ( 0x0B56, 0x0B56 ), ( 0x0B82, 0x0B82 ), ( 0x0BC0, 0x0BC0 ),
106 ( 0x0BCD, 0x0BCD ), ( 0x0C3E, 0x0C40 ), ( 0x0C46, 0x0C48 ),
107 ( 0x0C4A, 0x0C4D ), ( 0x0C55, 0x0C56 ), ( 0x0CBC, 0x0CBC ),
108 ( 0x0CBF, 0x0CBF ), ( 0x0CC6, 0x0CC6 ), ( 0x0CCC, 0x0CCD ),
109 ( 0x0CE2, 0x0CE3 ), ( 0x0D41, 0x0D43 ), ( 0x0D4D, 0x0D4D ),
110 ( 0x0DCA, 0x0DCA ), ( 0x0DD2, 0x0DD4 ), ( 0x0DD6, 0x0DD6 ),
111 ( 0x0E31, 0x0E31 ), ( 0x0E34, 0x0E3A ), ( 0x0E47, 0x0E4E ),
112 ( 0x0EB1, 0x0EB1 ), ( 0x0EB4, 0x0EB9 ), ( 0x0EBB, 0x0EBC ),
113 ( 0x0EC8, 0x0ECD ), ( 0x0F18, 0x0F19 ), ( 0x0F35, 0x0F35 ),
114 ( 0x0F37, 0x0F37 ), ( 0x0F39, 0x0F39 ), ( 0x0F71, 0x0F7E ),
115 ( 0x0F80, 0x0F84 ), ( 0x0F86, 0x0F87 ), ( 0x0F90, 0x0F97 ),
116 ( 0x0F99, 0x0FBC ), ( 0x0FC6, 0x0FC6 ), ( 0x102D, 0x1030 ),
117 ( 0x1032, 0x1032 ), ( 0x1036, 0x1037 ), ( 0x1039, 0x1039 ),
118 ( 0x1058, 0x1059 ), ( 0x1160, 0x11FF ), ( 0x135F, 0x135F ),
119 ( 0x1712, 0x1714 ), ( 0x1732, 0x1734 ), ( 0x1752, 0x1753 ),
120 ( 0x1772, 0x1773 ), ( 0x17B4, 0x17B5 ), ( 0x17B7, 0x17BD ),
121 ( 0x17C6, 0x17C6 ), ( 0x17C9, 0x17D3 ), ( 0x17DD, 0x17DD ),
122 ( 0x180B, 0x180D ), ( 0x18A9, 0x18A9 ), ( 0x1920, 0x1922 ),
123 ( 0x1927, 0x1928 ), ( 0x1932, 0x1932 ), ( 0x1939, 0x193B ),
124 ( 0x1A17, 0x1A18 ), ( 0x1B00, 0x1B03 ), ( 0x1B34, 0x1B34 ),
125 ( 0x1B36, 0x1B3A ), ( 0x1B3C, 0x1B3C ), ( 0x1B42, 0x1B42 ),
126 ( 0x1B6B, 0x1B73 ), ( 0x1DC0, 0x1DCA ), ( 0x1DFE, 0x1DFF ),
127 ( 0x200B, 0x200F ), ( 0x202A, 0x202E ), ( 0x2060, 0x2063 ),
128 ( 0x206A, 0x206F ), ( 0x20D0, 0x20EF ), ( 0x302A, 0x302F ),
129 ( 0x3099, 0x309A ), ( 0xA806, 0xA806 ), ( 0xA80B, 0xA80B ),
130 ( 0xA825, 0xA826 ), ( 0xFB1E, 0xFB1E ), ( 0xFE00, 0xFE0F ),
131 ( 0xFE20, 0xFE23 ), ( 0xFEFF, 0xFEFF ), ( 0xFFF9, 0xFFFB ),
132 ( 0x10A01, 0x10A03 ), ( 0x10A05, 0x10A06 ), ( 0x10A0C, 0x10A0F ),
133 ( 0x10A38, 0x10A3A ), ( 0x10A3F, 0x10A3F ), ( 0x1D167, 0x1D169 ),
134 ( 0x1D173, 0x1D182 ), ( 0x1D185, 0x1D18B ), ( 0x1D1AA, 0x1D1AD ),
135 ( 0x1D242, 0x1D244 ), ( 0xE0001, 0xE0001 ), ( 0xE0020, 0xE007F ),
136 ( 0xE0100, 0xE01EF ))
137
def __utf8_ucp_width(ucs):
    """ Get the textual width of a ucs character.

    :param ucs: the integer code point.
    :return: 0 for NUL and for anything listed in __combining, -1 for the
             other control code points (below 32, and 0x7f up to 0x9f),
             2 for the ranges tested at the end, and 1 for everything else.
    """
    if ucs == 0:
        return 0

    # Control characters have no sensible column width; -1 flags them.
    if ucs < 32 or (ucs >= 0x7f and ucs < 0xa0):
        return -1

    if __utf8_bisearch(ucs, __combining):
        return 0

    # Everything left is one column wide, unless it sits in one of the
    # ranges below, which take two columns.
    is_wide = (ucs >= 0x1100 and
               (ucs <= 0x115f or
                ucs == 0x2329 or ucs == 0x232a or
                (ucs >= 0x2e80 and ucs <= 0xa4cf and
                 ucs != 0x303f) or
                (ucs >= 0xac00 and ucs <= 0xd7a3) or
                (ucs >= 0xf900 and ucs <= 0xfaff) or
                (ucs >= 0xfe10 and ucs <= 0xfe19) or
                (ucs >= 0xfe30 and ucs <= 0xfe6f) or
                (ucs >= 0xff00 and ucs <= 0xff60) or
                (ucs >= 0xffe0 and ucs <= 0xffe6) or
                (ucs >= 0x20000 and ucs <= 0x2fffd) or
                (ucs >= 0x30000 and ucs <= 0x3fffd)))
    if is_wide:
        return 2
    return 1
167
168
def __utf8_iter_ints(msg):
    """ Yield the ordinal of every byte of msg, once passed through to_utf8().

    :param msg: a byte string, or a unicode string (which to_utf8() encodes).
    """
    for byte in to_utf8(msg):
        yield ord(byte)

def __utf8_iter_ucs(msg):
    """ Decode msg as UTF-8, one sequence at a time.

    Yields (code_point, byte_count) for each well-formed sequence.  At the
    first malformed sequence a single (None, byte_count) is yielded and the
    iteration stops; byte_count is then the number of bytes consumed for
    that sequence.  That includes a multi-byte sequence cut short by the end
    of the data, which is reported as malformed rather than silently ending
    the iteration.

    :param msg: a byte string or unicode string (see __utf8_iter_ints()).
    """
    uiter = __utf8_iter_ints(msg)
    for byte0 in uiter:
        if byte0 < 0x80:                # 0xxxxxxx: single byte
            yield (byte0, 1)
            continue

        # Work out how many continuation bytes the lead byte announces.
        if (byte0 & 0xe0) == 0xc0:      # 110xxxxx
            want = 1
        elif (byte0 & 0xf0) == 0xe0:    # 1110xxxx
            want = 2
        elif (byte0 & 0xf8) == 0xf0:    # 11110xxx
            want = 3
        else:                           # stray continuation / invalid lead
            yield (None, 1)
            return

        # Pull the continuation bytes from the same iterator; running out of
        # data here means the sequence is truncated.
        tail = []
        for cont in uiter:
            tail.append(cont)
            if len(tail) == want:
                break
        if len(tail) < want:
            yield (None, 1 + len(tail))
            return

        if want == 1:
            byte1 = tail[0]
            if (((byte1 & 0xc0) != 0x80) or
                ((byte0 & 0xfe) == 0xc0)):                  # overlong
                yield (None, 2)
                return
            yield ((((byte0 & 0x1f) << 6) | (byte1 & 0x3f)), 2)
        elif want == 2:
            (byte1, byte2) = tail
            if (((byte1 & 0xc0) != 0x80) or ((byte2 & 0xc0) != 0x80) or
                ((byte0 == 0xe0) and ((byte1 & 0xe0) == 0x80)) or  # overlong
                ((byte0 == 0xed) and ((byte1 & 0xe0) == 0xa0)) or  # surrogate
                ((byte0 == 0xef) and (byte1 == 0xbf) and
                 ((byte2 & 0xfe) == 0xbe))):                # U+FFFE / U+FFFF
                yield (None, 3)
                return
            yield ((((byte0 & 0x0f) << 12) | ((byte1 & 0x3f) << 6) |
                    (byte2 & 0x3f)), 3)
        else:
            (byte1, byte2, byte3) = tail
            if (((byte1 & 0xc0) != 0x80) or
                ((byte2 & 0xc0) != 0x80) or
                ((byte3 & 0xc0) != 0x80) or
                ((byte0 == 0xf0) and ((byte1 & 0xf0) == 0x80)) or  # overlong
                ((byte0 == 0xf4) and (byte1 > 0x8f)) or     # > U+10FFFF
                (byte0 > 0xf4)):                            # > U+10FFFF
                yield (None, 4)
                return

            yield ((((byte0 & 0x07) << 18) | ((byte1 & 0x3f) << 12) |
                    ((byte2 & 0x3f) << 6) | (byte3 & 0x3f)), 4)
214
def utf8_width(msg):
    """ Get the textual width of a utf8 string.

    Sums __utf8_ucp_width() over every decoded character.  When the decoder
    reports a malformed sequence, the byte count it reports is added
    instead, and nothing after that point is counted because the decoder
    stops there.

    :param msg: a byte string or unicode string.
    :return: the width as an integer.
    """
    total = 0
    for (ucs, nbytes) in __utf8_iter_ucs(msg):
        if ucs is None:
            total += nbytes
        else:
            total += __utf8_ucp_width(ucs)
    return total
224
def utf8_width_chop(msg, chop=None):
    """ Return the textual width of a utf8 string, chopping it to a specified
        value. This is what you want to use instead of %.*s, as it does the
        "right" thing with regard to utf-8 sequences. Eg.
        "%.*s" % (10, msg)   <= becomes =>   "%s" % (utf8_width_chop(msg, 10)[1])

    :param msg: a byte string or unicode string.
    :param chop: maximum width to keep, or None for no limit.
    :return: a (width, msg) tuple: the width of the returned text, and the
             text itself, of the same kind (unicode or not) as was passed.
    """
    full_width = utf8_width(msg)
    if chop is None or full_width <= chop:
        return full_width, msg

    ret = 0
    passed_unicode = isinstance(msg, unicode)
    msg_bytes = 0
    # Work on bytes so that the slice offset below lines up with the byte
    # counts reported by the decoder.
    msg = to_utf8(msg)
    for (ucs, nbytes) in __utf8_iter_ucs(msg):
        if ucs is None:
            width = nbytes
        else:
            width = __utf8_ucp_width(ucs)

        if (ret + width) > chop:
            # Cut before this character, so no sequence is split in half.
            msg = msg[:msg_bytes]
            break
        ret += width
        msg_bytes += nbytes

    if passed_unicode:
        msg = to_unicode(msg)

    return ret, msg
254
def utf8_width_fill(msg, fill, chop=None, left=True, prefix='', suffix=''):
    """ Expand a utf8 msg to a specified "width" or chop to same.
        Expansion can be left or right. This is what you want to use instead of
        %*.*s, as it does the "right" thing with regard to utf-8 sequences.
        prefix and suffix should be used for "invisible" bytes, like
        highlighting.
        Eg.
        "%-*.*s" % (10, 20, msg)
           <= becomes =>
        "%s" % (utf8_width_fill(msg, 10, 20)).

        "%20.10s" % (msg)
           <= becomes =>
        "%s" % (utf8_width_fill(msg, 20, 10, left=False)).

        "%s%.10s%s" % (prefix, msg, suffix)
           <= becomes =>
        "%s" % (utf8_width_fill(msg, 0, 10, prefix=prefix, suffix=suffix)).

    :param msg: the text to pad and/or chop.
    :param fill: minimum width; shorter text is padded with spaces.
    :param chop: maximum width passed to utf8_width_chop(), None for no limit.
    :param left: True keeps the text on the left (padding after it), False
                 puts the padding in front.
    :param prefix: text put directly before msg, not counted in the width.
    :param suffix: text put directly after msg, not counted in the width.
    :return: the resulting string; unicode if msg was passed as unicode.
    """
    passed_msg = msg
    width, msg = utf8_width_chop(msg, chop)

    if width >= fill:
        # Already wide enough: only wrap in prefix/suffix, if there are any.
        if prefix or suffix:
            msg = ''.join([prefix, msg, suffix])
    else:
        extra = " " * (fill - width)
        if left:
            msg = ''.join([prefix, msg, suffix, extra])
        else:
            msg = ''.join([extra, prefix, msg, suffix])

    if isinstance(passed_msg, unicode):
        return to_unicode(msg)

    return msg
291
def utf8_valid(msg):
    """ Return True/False depending on whether the text is valid utf8.

    :param msg: a byte string or unicode string.
    :return: False as soon as the decoder reports a malformed sequence,
             True if it gets through the whole text.
    """
    for (ucs, nbytes) in __utf8_iter_ucs(msg):
        if ucs is None:
            return False
    return True
298
def _utf8_width_le(width, *args):
    """ Minor speed hack, we often want to know "does X fit in Y". It takes
        "a while" to work out a utf8_width() (see above), and we know that a
        utf8 character is always <= byte. So given:

        assert bytes >= characters
        characters <= width?

        ...we can change to:

        bytes <= width or characters <= width

        ...and bytes are much faster.

    :param width: the width the strings have to fit in.
    :param args: the strings whose combined width is tested.
    :return: True if the combined width of args is <= width.
    """
    # The shortcut only holds for *byte* counts, so encode first: len() of a
    # unicode string counts characters, and a double-width character would
    # then be counted as 1.
    byte_total = 0
    for arg in args:
        byte_total += len(to_utf8(arg))
    if byte_total <= width:
        return True

    # Too many bytes; fall back to the real (slow) width computation.
    col_total = 0
    for arg in args:
        col_total += utf8_width(arg)
    return col_total <= width
322
def utf8_text_wrap(text, width=70, initial_indent='', subsequent_indent=''):
    """ Works like we want textwrap.wrap() to work, uses utf-8 data and
        doesn't screw up lists/blocks/etc.

    :param text: the text to wrap, as a byte string or unicode string.
    :param width: the maximum width of an output line.
    :param initial_indent: string put in front of the first output line.
    :param subsequent_indent: string put in front of every later line.
    :return: a list of lines, without trailing newlines; the lines are
             unicode if text was passed as unicode.
    """
    passed_unicode = isinstance(text, unicode)

    def _indent_at_beg(line):
        """ Return (leading_spaces, list_indent) for line.

        list_indent is non-zero only when the line starts (after its leading
        spaces) with a list marker that is followed by indented text; it is
        then the column at which that text starts. """
        count = 0
        byte = 'X'      # sentinel, so an empty line is not taken for a list
        for byte in line:
            if byte != ' ':
                break
            count += 1
        # '\xe2' is the first byte of the multi-byte bullets tested below.
        if byte not in ("-", "*", ".", "o", '\xe2'):
            return count, 0
        list_chr = utf8_width_chop(line[count:], 1)[1]
        if list_chr in ("-", "*", ".", "o",
                        "\xe2\x80\xa2", "\xe2\x80\xa3", "\xe2\x88\x98"):
            nxt = _indent_at_beg(line[count+len(list_chr):])
            nxt = nxt[1] or nxt[0]
            if nxt:
                return count, count + 1 + nxt
        return count, 0

    initial_indent = to_utf8(initial_indent)
    subsequent_indent = to_utf8(subsequent_indent)

    # All the work is done on utf-8 bytes; tabs become 8 spaces.
    text = to_utf8(text).rstrip('\n')
    lines = text.replace('\t', ' ' * 8).split('\n')

    ret = []
    indent = initial_indent
    wrap_last = False       # was the previous input line wrapped?
    csab = 0                # current line: number of leading spaces
    cspc_indent = 0         # current line: list indent (0 if not a list)
    for line in lines:
        line = line.rstrip(' ')
        (lsab, lspc_indent) = (csab, cspc_indent)
        (csab, cspc_indent) = _indent_at_beg(line)
        # After a wrapped line, the pending text is flushed (instead of
        # joining this line onto it) when:
        force_nl = False
        if wrap_last and cspc_indent:       # this line starts a list item,
            force_nl = True
        if wrap_last and csab == len(line): # or is empty,
            force_nl = True
        if wrap_last and not lspc_indent:   # or, outside of a list, is
            if csab >= 4 and csab != lsab:  # indented as a new block.
                force_nl = True
        if force_nl:
            ret.append(indent.rstrip(' '))
            indent = subsequent_indent
            wrap_last = False
        if csab == len(line):   # only spaces: treat as an empty line
            line = ''
        if wrap_last:
            # Continuing a wrapped paragraph: join onto the pending text and
            # keep the indentation of the line that started it.
            line = line.lstrip(' ')
            cspc_indent = lspc_indent

        if _utf8_width_le(width, indent, line):
            wrap_last = False
            ret.append(indent + line)
            indent = subsequent_indent
            continue

        # The line is too long: re-flow it word by word.
        wrap_last = True
        words = line.split(' ')
        line = indent
        spcs = cspc_indent
        if not spcs and csab >= 4:
            spcs = csab
        for word in words:
            # Break before the word if it does not fit, unless the line so
            # far holds nothing beyond the indent (never emit empty lines).
            if (not _utf8_width_le(width, line, word) and
                utf8_width(line) > utf8_width(subsequent_indent)):
                ret.append(line.rstrip(' '))
                line = subsequent_indent + ' ' * spcs
            line += word
            line += ' '
        # What is left over is carried as the "indent" of the next line, so
        # that a following line can be joined onto it.
        indent = line.rstrip(' ') + ' '
    if wrap_last:
        ret.append(indent.rstrip(' '))

    if passed_unicode:
        return map(to_unicode, ret)
    return ret
419
def utf8_text_fill(text, *args, **kwargs):
    """ Works like we want textwrap.fill() to work, uses utf-8 data and
        doesn't screw up lists/blocks/etc.

    :param text: the text to wrap.
    :param args: passed straight on to utf8_text_wrap().
    :param kwargs: passed straight on to utf8_text_wrap().
    :return: the lines from utf8_text_wrap(), joined with newlines.
    """
    wrapped = utf8_text_wrap(text, *args, **kwargs)
    return '\n'.join(wrapped)
424
425
def to_unicode(obj, encoding='utf-8', errors='replace'):
    ''' convert a 'str' to 'unicode'

    :param obj: the object to convert.
    :param encoding: the encoding the byte string is decoded with.
    :param errors: the error handling scheme passed to the decoder.
    :return: the decoded text for a byte string; unicode objects and
             anything that is not a string are returned unchanged.
    '''
    if isinstance(obj, basestring) and not isinstance(obj, unicode):
        obj = unicode(obj, encoding, errors)
    return obj
432
def to_utf8(obj, errors='replace'):
    '''convert 'unicode' to an encoded utf-8 byte string

    :param obj: the object to convert.
    :param errors: the error handling scheme passed to the encoder.
    :return: the utf-8 bytes for a unicode object; anything else is
             returned unchanged.
    '''
    if isinstance(obj, unicode):
        obj = obj.encode('utf-8', errors)
    return obj
438
439
# NOTE(review): the "def" line of this function is missing from this copy of
# the file; the name and the defaults below are believed to match the
# upstream module, but confirm them against the callers.
def to_unicode_maybe(obj, encoding='utf-8', errors='replace'):
    ''' Don't ask don't tell, only use when you must

    Same as to_unicode(), except that a UnicodeEncodeError raised during the
    conversion is swallowed and obj is handed back unchanged.
    '''
    try:
        return to_unicode(obj, encoding, errors)
    except UnicodeEncodeError:
        return obj
446
# NOTE(review): the "def" line of this function is missing from this copy of
# the file; the name below is believed to match the upstream module, but
# confirm it against the callers.
def to_str(obj):
    """ Convert something to a string, if it isn't one.

    :param obj: any object.
    :return: obj itself if it is already a str or unicode string, otherwise
             str(obj).
    """
    if isinstance(obj, basestring):
        return obj
    return str(obj)
454
# NOTE(review): the "def" line of this function is missing from this copy of
# the file; the name below is believed to match the upstream module, but
# confirm it against the callers.
def str_eq(a, b):
    """ convert between unicode and not and compare them, w/o warning or being annoying

    :param a: first string (byte string or unicode).
    :param b: second string (byte string or unicode).
    :return: True if both are equal; when exactly one of them is unicode,
             the comparison is done on their utf-8 encoded forms.
    """
    if isinstance(a, unicode) == isinstance(b, unicode):
        # Same kind on both sides: a direct comparison is safe.
        if a == b:
            return True
    elif to_utf8(a) == to_utf8(b):
        return True

    return False
464
try:
    # Setup the yum translation domain and make _() and P_() translation
    # wrappers available.
    # using ugettext to make sure translated strings are in Unicode.
    import gettext
    t = gettext.translation('yum', fallback=True)
    _ = t.ugettext
    P_ = t.ungettext
except Exception:
    # Something went wrong, so we install dummy wrappers that just return
    # the text they are given.  Deliberately best-effort: a broken locale
    # setup must not stop the module from being imported.
    _ = dummy_wrapper
    P_ = dummyP_wrapper
482
if __name__ == "__main__":
    import sys

    def out(arg):
        """ Print arg as utf-8 and as unicode, with its length, validity,
            width and a few utf8_width_fill() samples. """
        arg = to_utf8(arg)
        print("UTF8 : %s" % (arg,))
        print("len : %d" % (len(arg),))
        arg = to_unicode(arg)
        print("USC : %s" % (arg,))
        print("len : %d" % (len(arg),))
        print("valid: %s" % (utf8_valid(arg),))
        print("width: %s" % (utf8_width(arg),))
        print("4.8 : %s%s%s" % ('<', utf8_width_fill(arg, 4, 8), '>'))
        print("4.3 : %s%s%s" % ('<', utf8_width_fill(arg, 4, 3), '>'))
        print("4.2 : %s%s%s" % ('<', utf8_width_fill(arg, 4, 2), '>'))
        print("4.1 : %s%s%s" % ('<', utf8_width_fill(arg, 4, 1), '>'))
        print("3.3 : %s%s%s" % ('<', utf8_width_fill(arg, 3, 3), '>'))
        print("3.2 : %s%s%s" % ('<', utf8_width_fill(arg, 3, 2), '>'))
        print("3.1 : %s%s%s" % ('<', utf8_width_fill(arg, 3, 1), '>'))
        print("40.79: %s%s%s" % ('<', utf8_width_fill(arg, 40, 79), '>'))
        print("40.20: %s%s%s" % ('<', utf8_width_fill(arg, 40, 20), '>'))
        print('')

    print(" ---- Arguments/str ---- ")
    for arg in sys.argv[1:]:
        out(arg)

    print(" ---- Arguments/gettext ---- ")
    for arg in sys.argv[1:]:
        try:
            arg = _(arg)
        except UnicodeDecodeError:
            continue
        out(arg)

    if len(sys.argv) > 2:
        # Treat the first argument as a format string for the others.  The
        # right-hand side of "%" has to be a tuple: a list would be taken as
        # one single value.
        print(" ---- Arguments/str/all ---- ")
        out(sys.argv[1] % tuple(sys.argv[2:]))

        print(" ---- Arguments/gettext/all ---- ")
        try:
            arg = _(sys.argv[1]) % tuple(map(_, sys.argv[2:]))
        except UnicodeDecodeError:
            sys.exit(0)
        out(arg)
528