 |
|
|  |
파이썬에서 유니코드 스트림 다루기 |
|
|
 |
8년 전 |
파이썬에서 유니코드를 다룰 때는 일반적으로 str.decode()와 unicode.encode() 메서드를 사용하여 unicode 타입과 str 타입을 상호 변환한다.
아래 예시에서는 'utf-16'으로 작성된 파일을 열어, 수직 탭(vertical tab) 코드포인트를 지운 다음, 'utf-8'로 저장한다. (깨진 XML을 다룰 때 이 방식이 매우 중요하다.)
# Load the entire input file into memory as raw bytes.
with open("input.txt", "rb") as src:
    raw = src.read()
# Decode the UTF-16 bytes to unicode, drop every vertical tab (U+000B),
# then encode the cleaned text as UTF-8.
cleaned = raw.decode("utf-16").replace(u"\u000B", u"").encode("utf-8")
# Write the UTF-8 bytes to the output file.
with open("output.txt", "wb") as dst:
    dst.write(cleaned)
엄청나게 큰 파일을 다룰 때가 아니라면 이 정도로도 충분하다. 하지만 큰 파일을 다룰 땐 모든 데이터가 메모리에 올라간다는 사실이 문제가 된다.
스트리밍 인코더/디코더 사용하기
파이썬 기본 라이브러리에는 codecs 모듈이 포함되어 있다. 이 모듈을 사용하면 파일을 조금씩 읽을 수 있고, 메모리에도 약간의 유니코드 데이터만 올라가게 된다.
codecs.open() 헬퍼 메서드를 사용하여 위의 예시를 최소한만 고쳐보자.
import codecs
# Open the input and output streams in the ``with`` header itself, so the
# input file is closed even if opening the output file fails.
# codecs.open() decodes on read and encodes on write, so the loop below
# only ever handles unicode data.
with codecs.open("input.txt", "rb", encoding="utf-16") as source, \
        codecs.open("output.txt", "wb", encoding="utf-8") as target:
    # Stream the unicode data one chunk at a time instead of loading the
    # whole file into memory.
    while True:
        # Read the next chunk; an empty result means end of file.
        chunk = source.read(4096)
        if not chunk:
            break
        # Drop vertical tabs (U+000B) and write the cleaned chunk.
        target.write(chunk.replace(u"\u000B", u""))
파일은 끔찍해! 이터레이터 사용하기
파일은 다루기가 좀 지루하다. 복잡한 처리 과정에는 유니코드 데이터의 이터레이터를 다루는 편이 깔끔할 것이다.
아래는 iterdecode()를 사용하여, 파일을 유니코드 데이터 조각의 이터레이터로 읽는 효과적인 방법이다.
from functools import partial
from codecs import iterdecode
# Return the file at a given path as an iterator of unicode chunks.
def iter_unicode_chunks(path, encoding, chunk_size=4096):
    """Lazily yield the decoded contents of a file as unicode chunks.

    The file is read in binary mode and fed through ``codecs.iterdecode``,
    so a code point whose bytes straddle two reads is still decoded
    correctly.

    Args:
        path: Path of the file to read.
        encoding: Name of the codec used to decode the file's bytes.
        chunk_size: Number of bytes requested from the file per read.
            Only the size of the yielded pieces depends on it; the
            concatenated text is the same for any value.

    Yields:
        Unicode strings which, concatenated in order, form the decoded
        contents of the file.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 (raised when
            iteration starts, because this is a generator).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    # Open the file to read.
    with open(path, "rb") as source:
        # Turn the binary file into an iterator of binary chunks. The
        # sentinel must be b"": a binary read() returns bytes at EOF, and
        # on Python 3 that never compares equal to the str "".
        binary_chunks = iter(partial(source.read, chunk_size), b"")
        # Turn the binary chunks into unicode chunks.
        for unicode_chunk in iterdecode(binary_chunks, encoding):
            yield unicode_chunk
이제 iterencode() 메서드를 사용하여, 유니코드 조각의 이터레이터를 파일에 써보자.
from codecs import iterencode
# Write an iterator of unicode chunks to the file at a given path.
def write_unicode_chunks(path, unicode_chunks, encoding):
    """Encode ``unicode_chunks`` with ``encoding`` and write them to ``path``.

    Args:
        path: Path of the file to write; it is opened in ``"wb"`` mode.
        unicode_chunks: Iterable of unicode strings to write, in order.
        encoding: Name of the codec used to encode the chunks.
    """
    # Lazily convert the unicode chunks into binary chunks; nothing is
    # encoded until the file below starts consuming the iterator.
    encoded_chunks = iterencode(unicode_chunks, encoding)
    # Open the file to write and stream every binary chunk into it.
    with open(path, "wb") as sink:
        sink.writelines(encoded_chunks)
이 두 함수와 함께 유니코드 데이터의 스트림에서 수직 탭을 없애는 일이 마법 같이 끝난다(just becomes a case of plumbing everything together).
# Read the input file lazily as a stream of unicode chunks.
decoded_chunks = iter_unicode_chunks("input.txt", encoding="utf-16")
# Wrap the stream in a generator that strips vertical tabs (U+000B) from
# each chunk as it passes through.
cleaned_chunks = (piece.replace(u"\u000B", u"") for piece in decoded_chunks)
# Write the cleaned chunks to the output file, encoded as UTF-8.
write_unicode_chunks("output.txt", cleaned_chunks, encoding="utf-8")
거창하게 codecs 모듈을 사용해야 할까?
얼핏 그냥, str.decode()와 unicode.encode() 메서드를 사용하여 큰 file 객체를 바이너리 조각으로 읽고, 인코딩하고 디코딩하는 편이 간단하다고 생각할 수도 있겠다.
# BAD EXAMPLE. Do not do it this way!
# This snippet is kept deliberately broken to illustrate the pitfall:
# every binary chunk is decoded on its own, so a code point whose bytes
# are split across two chunks can raise UnicodeDecodeError.
# Open the input and output streams.
with open("input.txt", "rb") as src, open("output.txt", "wb") as dst:
    # Walk over the binary data in 4096-byte chunks until read() returns
    # an empty chunk at end of file.
    for raw in iter(lambda: src.read(4096), b""):
        # DANGER: decode this binary chunk as UTF-16 in isolation.
        text = raw.decode("utf-16")
        # Drop vertical tabs (U+000B).
        text = text.replace(u"\u000B", u"")
        # Encode the unicode data as UTF-8 and write the chunk out.
        dst.write(text.encode("utf-8"))
불행히도 몇몇 유니코드 코드포인트는 바이너리 데이터에서 두 바이트 이상으로 인코딩된다. 따라서 단순히 파일에서 바이트 조각들을 읽어서 decode() 메서드를 적용하면 예기치 않게 UnicodeDecodeError가 발생할 수도 있다. 이는 여러 바이트로 이루어진 코드포인트 하나가 조각의 경계에서 둘로 나뉠 수 있기 때문이다.
codecs 모듈의 도구들을 사용하면 이러한 예기치 않은 충돌을 예방할 수 있다.
파이썬 3에서는?
파이썬 3에서는 훨씬 단순하게 유니코드 파일을 다룰 수 있다. 빌트인 메서드인 open()은 유니코드 데이터를 수정하거나 인코딩을 변경하는 데 필요한 기능을 포함하고 있다.
# Open the input and output streams in the ``with`` header itself, so the
# input file is closed even if opening the output file fails.
# newline="" turns off newline translation in both directions, so line
# endings pass through unchanged and only vertical tabs are removed.
with open("input.txt", "rt", encoding="utf-16", newline="") as source, \
        open("output.txt", "wt", encoding="utf-8", newline="") as target:
    # Stream the unicode data one chunk at a time.
    while True:
        # Read the next chunk; an empty string means end of file.
        chunk = source.read(4096)
        if not chunk:
            break
        # Drop vertical tabs (U+000B) and write the cleaned chunk.
        target.write(chunk.replace("\u000B", ""))
파이썬 3의 시대다! 즐겁게 코딩하길! |
|
̵̧ : 380 |
̵̧
목록
|
|
|  |
|