파이썬 소스 한글 인코딩 지정(펌)

파이썬 소스 한글 인코딩 지정(펌)

2010. 4. 7. 16:03ㆍ프로그래밍언어/PYTHON

PYTHON ERROR

파이썬 소스에 한글 문자열이 있으면 다음과 같은 "SyntaxError: Non-ASCII character..."라는 에러가 납니다:

File "D:\Z\0.py", line 4

SyntaxError: Non-ASCII character '\x8c' in file D:\Z\0.py on line 4, but no encoding declared; see http://www.python.org/peps/pep-0263.html for details

파이썬 2는 소스 파일의 기본 인코딩이 ASCII(영문)이기에, 소스에 한글을 쓰려면 한글 인코딩을 지정해 주어야 합니다.

# -*- coding: 949 -*-

위와 같은 행을, 파이썬 소스의 반드시 "1번째 줄" 또는 "2번째 줄"에 위치시켜야 합니다. 3번째 줄부터는 안됩니다.

또한

coding:

이곳의 콜론(:)기호를

coding :

이렇게 띄어쓰면 안됩니다.

파이썬 소스 한글 인코딩 지정 예제

파일명: 0.py

#!/usr/bin/python

# -*- coding: 949 -*-

print "Hello World! 똠방각하"

# 이 줄은 주석문(코멘트; Comment)입니다.

# -*- coding: cp949 -*-

또는

# -*- coding: ms949 -*-

라고 해도 됩니다.

그러나

# -*- coding: euc-kr -*-

이라고 하면, "똠방각하"의 "똠"자 등이 표현되지 않습니다.

"똠"자를 처리하지 못하고 이런 에러가 납니다: SyntaxError: 'euc_kr' codec can't decode bytes in position 20-21: illegal multibyte sequence

즉 확장완성형이 아닌 그냥 완성형으로 지정됩니다.

디렉토리의 파일목록 얻기

glob 모듈의 glob 함수 사용: 와일드카드 문자 ?(임의의 문자 1개와 매칭), *(0개 이상의 임의의 문자와 매칭), [...](대괄호 안의 문자 중 1개와 매칭)를 지원한다.

import glob
# List entries of the current directory whose names are a single digit,
# a dot, and then any (possibly empty) suffix.
glob.glob('./[0-9].*')

파일의 추가정보 알아내기

isfile(path)- path가 일반 파일이면 True 리턴

isdir(path)- path가 디렉토리이면 True 리턴

islink(path), ismount(path), exists(path)

파일크기, 접근시간

경로명 다루기
import os
p = os.path.abspath('LogMergeServer.py') # convert a relative path to an absolute path
p
# Value echoed by the interpreter for p:
'/hanmail/wap_log/LogMergeServer.py'
os.path.split(p) # split into (directory, file name)
# Value echoed by the interpreter for the call above:
('/hanmail/wap_log', 'LogMergeServer.py')
URL다루기

urlparse함수는 아래와 같이 분리하여 튜플을 리턴

(addressing scheme, network location, path, parameters, query, fragment identifier)

ex)
import urlparse
# Sample URL whose query string carries a Korean keyword.
a='http://3355.daum.net/daum/search/top/TotalList?q=스파이더맨3&skey='
# Split the URL into its components.
x=urlparse.urlparse(a)
x
# Value echoed by the interpreter for x:
('http', '3355.daum.net', '/daum/search/top/TotalList', '', 'q=\xbd\xba\xc6\xc4\xc0\xcc\xb4\xf5\xb8\xc73&skey=', '')
# Reassemble the components into a single URL string.
urlparse.urlunparse(x)
# Value echoed by the interpreter for the call above:
'http://3355.daum.net/daum/search/top/TotalList?q=\xbd\xba\xc6\xc4\xc0\xcc\xb4\xf5\xb8\xc73&skey='
# Resolve the relative reference 'SearchTop' against the base URL a.
urlparse.urljoin(a, 'SearchTop')
'http://3355.daum.net/daum/search/top/SearchTop'

Python regex

   # Scratch example: take one access-log line, pull out the request URL and
   # match it against a search-URL pattern. Relies on `urllib` and `re`
   # being imported elsewhere; the fragment itself does not import them.
   text = '''128.134.98.74 - - [03/Jul/2007:17:03:01 +0900] "GET /daum/search/top/TotalList?q=\xc4\xab\xb5\xe5\xb2\xa4&access=KTF_SEARCH HTTP/1.1" 200 10949 "-" "Mozilla/1.22 (compatible;KUN/2.1.1; EV-K100; CellPhone)" "-" "8201096336669" "-"'''

    # Percent-decode the whole log line.
    text = urllib.unquote(text)

    # The request line is the first double-quoted field; its second
    # space-separated token is the requested URL.
    url = text.split("\"")[1].split(" ")[1]

    # NOTE(review): decode() without an argument uses the interpreter's
    # default codec; the sample URL contains non-ASCII bytes, so this may
    # raise UnicodeDecodeError unless the default was changed - confirm.
    url = url.decode()



    # Group 1: the third path segment. Group 2: the characters following
    # "q=" up to the next "&" or a character in the \xA1-\xFE range.
    # NOTE(review): in the sample above "q=" is followed directly by
    # characters in the excluded range, so the pattern appears not to match
    # that line - verify.
    pattern = r'^/\w+/\w+/(\w+)/\w.*q=([^&\xA1-\xFE]+).+?'

    p = re.compile(pattern)

    # Alternative pattern, left commented out:
    #p = re.compile(r'^/\w+/\w+/(\w+)/\w.*q=([^(\xa1-\xfe0-9a-zA-Z]+).+?')

    # NOTE(review): match() returns None when the pattern does not match;
    # the group() calls below would then raise AttributeError.
    m = p.match(url)

    print url

    print '------------------------------------------------------'

    # Whole matched text.
    print m.group(0)

    # Third path segment.
    print m.group(1)

    # Text captured after "q=".
    q = m.group(2)

    print q

# -*- coding: euc-kr -*-

import urllib

import re

import time

class parseLog:
    """Extract (collection, keyword) pairs from today's search log.

    The constructor opens ``<yymmdd>.search_log`` for reading and
    ``<yymmdd>.search_map`` for appending, where ``<yymmdd>`` is the current
    local date.  Call ``extractUrl()`` and then ``parseUrl()``; the result
    is left in ``self.collMap``.  Call ``close()`` when done.
    """

    # Group 1 is the third path segment (the collection); group 2 is the
    # text after "q=" up to the next "&" or a character in \xA1-\xFE.
    # The original pattern ended in ".+?", which demanded one more character
    # after the keyword and therefore chopped the last character off any
    # keyword that ended the URL.  Compiled once instead of once per URL.
    _URL_PATTERN = re.compile(r'^/\w+/\w+/(\w+)/\w.*q=([^&\xA1-\xFE]+)')

    def __init__(self):
        self.today = time.strftime('%y%m%d')
        self.infile = open(self.today + '.search_log', 'r')
        # NOTE(review): nothing in this class writes to outfile; it is kept
        # because callers may use the attribute.
        self.outfile = open(self.today + '.search_map', 'a')

    def close(self):
        """Close both files opened by the constructor."""
        self.infile.close()
        self.outfile.close()

    def extractUrl(self):
        """Fill ``self.urls`` with the percent-decoded URL of each log line.

        The URL is the second space-separated token of the first
        double-quoted field of the line.  Lines that do not have that
        shape are skipped instead of raising IndexError.
        """
        self.urls = []
        for line in self.infile:
            # Locate the URL before unquoting, so that an encoded space or
            # double quote (%20, %22) inside it cannot shift the fields.
            try:
                target = line.split("\"")[1].split(" ")[1]
            except IndexError:
                continue
            url = urllib.unquote(target)
            # decode() uses the default codec and fails on non-ASCII bytes;
            # in that case keep the raw string, which the pattern in
            # parseUrl() can still be applied to.
            try:
                url = url.decode()
            except UnicodeDecodeError:
                pass
            self.urls.append(url)

    def parseUrl(self):
        """Fill ``self.collMap`` with a (collection, keyword) tuple per URL.

        URLs that do not match the search-URL pattern are skipped.
        """
        self.collMap = []
        for url in self.urls:
            m = self._URL_PATTERN.match(url)
            if m is None:
                continue
            self.collMap.append((m.group(1), m.group(2)))











if __name__ == "__main__":
    # Parse today's log and print the (collection, keyword) pairs.
    p = parseLog()
    try:
        p.extractUrl()
        p.parseUrl()
    finally:
        # Release both files opened by the constructor, even on error.
        p.infile.close()
        p.outfile.close()

    print(p.collMap)
# # -*- coding: euc-kr -*-
import urllib
import re
import time
from array import *
class CollectionInfo:
    """Hit counters for one search keyword, one counter per collection.

    Attributes:
        cafe, know, blog, news, image, web: number of hits recorded for
            that collection; all start at 0.
        keyword: the keyword these counters belong to; starts as ''.
    """

    def __init__(self):
        # Per-instance state (previously class-level defaults that each
        # instance shadowed on first increment).
        self.cafe = 0
        self.know = 0
        self.blog = 0
        self.news = 0
        self.image = 0
        self.web = 0
        self.keyword = ''

    def setKeyword(self, keyword):
        """Record the keyword these counters belong to."""
        self.keyword = keyword

    def addCafe(self):
        """Count one more 'cafe' hit."""
        self.cafe += 1

    def addKnow(self):
        """Count one more 'know' hit."""
        self.know += 1

    def addBlog(self):
        """Count one more 'blog' hit."""
        self.blog += 1

    def addNews(self):
        """Count one more 'news' hit."""
        self.news += 1

    def addImage(self):
        """Count one more 'image' hit."""
        self.image += 1

    def addWeb(self):
        """Count one more 'web' hit."""
        self.web += 1

    def summary(self):
        """Return the counters as the one-line text that printColl() prints."""
        return 'cafe: %s / blog: %s / know: %s / news: %s / image: %s / web: %s' % (
            self.cafe, self.blog, self.know, self.news, self.image, self.web)

    def printColl(self):
        """Print the counters on one line; returns None."""
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(self.summary())


class ParseLog:
    """Count, per search keyword, how often each collection was queried.

    The constructor opens ``<yymmdd>.search_log`` for reading and
    ``<yymmdd>.search_map`` for appending, where ``<yymmdd>`` is the current
    local date.  Call ``extractUrl()`` and then ``parseUrl()``; results are
    left in ``self.coldic`` (keyword -> CollectionInfo) and ``self.collMap``
    (list of (collection, keyword) tuples).  Call ``close()`` when done.
    """

    # Collection name -> name of the CollectionInfo method that counts it.
    _COUNTER_METHODS = {
        'cafe': 'addCafe',
        'blog': 'addBlog',
        'know': 'addKnow',
        'news': 'addNews',
        'image': 'addImage',
        'web': 'addWeb',
    }

    # Captures the collection name that is followed by "/" in the URL path.
    # NOTE(review): 'dr' is recognised but has no counter, while 'web' has a
    # counter but is not recognised here - confirm which one is intended.
    _COLLECTION_PATTERN = re.compile(r'^/.*(cafe|know|blog|image|dr|news)/.*')

    def __init__(self):
        self.today = time.strftime('%y%m%d')
        self.infile = open(self.today + '.search_log', 'r')
        # NOTE(review): nothing in this class writes to outfile; it is kept
        # because callers may use the attribute.
        self.outfile = open(self.today + '.search_map', 'a')
        self.coldic = {}

    def close(self):
        """Close both files opened by the constructor."""
        self.infile.close()
        self.outfile.close()

    def extractUrl(self):
        """Fill ``self.urls`` with the percent-decoded URL of each log line.

        The URL is the second space-separated token of the first
        double-quoted field of the line.  Lines that do not have that
        shape are skipped instead of raising IndexError.
        """
        self.urls = []
        for line in self.infile:
            # Locate the URL before unquoting, so that an encoded space or
            # double quote (%20, %22) inside it cannot shift the fields.
            try:
                target = line.split("\"")[1].split(" ")[1]
            except IndexError:
                continue
            self.urls.append(urllib.unquote(target))

    def extractKeyword(self, url):
        """Return the value of the ``q`` query parameter of ``url``.

        Example input:
        ``/daum/search/blog/BlogList?q=%C7%CF%B5%CE%B8%AE%B3%E0&skey=``

        Returns '' when the URL has neither ``?q=`` nor ``&q=`` (this used
        to raise UnboundLocalError).  When both occur, ``&q=`` wins.
        """
        index = -1
        for marker in ('?q=', '&q='):
            pos = url.find(marker)
            if pos != -1:
                index = pos
        if index == -1:
            return ''

        # Skip the 3-character marker, then cut at the next parameter.
        return url[index + 3:].split('&', 1)[0]

    def _count(self, col, collection):
        """Increment the counter of ``col`` that belongs to ``collection``.

        Collection names without a counter are ignored.
        """
        method = self._COUNTER_METHODS.get(collection)
        if method is not None:
            getattr(col, method)()

    def setCollectionInfo(self, collection, keyword):
        """Return a new CollectionInfo for ``keyword`` with one hit counted."""
        col = CollectionInfo()
        # The keyword argument was previously accepted but never stored.
        col.setKeyword(keyword)
        self._count(col, collection)
        return col

    def setKeywordStat(self, collection, keyword, isFirst = False):
        """Record one hit of ``collection`` for ``keyword`` in ``self.coldic``.

        A new entry is created when the keyword is unseen, or when
        ``isFirst`` is true (which replaces any existing entry).
        """
        if isFirst or keyword not in self.coldic:
            self.coldic[keyword] = self.setCollectionInfo(collection, keyword)
        else:
            self._count(self.coldic[keyword], collection)

    def parseUrl(self):
        """Update ``self.coldic`` and rebuild ``self.collMap`` from ``self.urls``.

        URLs without a recognised collection or without a keyword are
        skipped.  Each counted URL and its keyword are printed.
        """
        self.collMap = []
        for url in self.urls:
            colMat = self._COLLECTION_PATTERN.search(url)
            if colMat is None:
                continue
            collection = colMat.group(1)
            keyword = self.extractKeyword(url)
            # Compare by value; "is not ''" tested object identity.
            if not keyword:
                continue
            # Remove spaces ('+' stands for a space in a query string).
            keyword = keyword.replace('+', '')
            print(url)
            print(keyword)
            self.setKeywordStat(collection, keyword)
            self.collMap.append((collection, keyword))

if __name__ == "__main__":
    # Parse today's log, then print one "keyword counters" line per keyword.
    import sys

    p = ParseLog()
    try:
        p.extractUrl()
        p.parseUrl()
    finally:
        # Release both files opened by the constructor, even on error.
        p.infile.close()
        p.outfile.close()

    print('======================================')
    for keyword in p.coldic:
        # printColl() prints the counters itself and returns None, so write
        # the keyword first instead of printing printColl()'s return value.
        sys.stdout.write(keyword + ' ')
        p.coldic[keyword].printColl()


Pasted from <http://joyungki.springnote.com/pages/227536>

'프로그래밍언어 > PYTHON' 카테고리의 다른 글

Python 이클립스 플러그인 Pydev 사용기 (0)	2010.03.12

Python 이클립스 플러그인 Pydev 사용기 2010.03.12

Long's Life In the South of Korea

Long's Life In the South of Korea

태그

최근글

댓글

공지사항

아카이브

'프로그래밍언어 > PYTHON' 카테고리의 다른 글

관련글

티스토리툴바