Package openid :: Module fetchers
[frames] | [no frames]

Source Code for Module openid.fetchers

  1  # -*- test-case-name: openid.test.test_fetchers -*- 
  2  """ 
  3  This module contains the HTTP fetcher interface and several implementations. 
  4  """ 
  5   
  6  __all__ = ['fetch', 'getDefaultFetcher', 'setDefaultFetcher', 'HTTPResponse', 
  7             'HTTPFetcher', 'createHTTPFetcher', 'HTTPFetchingError', 
  8             'HTTPError'] 
  9   
 10  import urllib2 
 11  import time 
 12  import cStringIO 
 13  import sys 
 14   
 15  import openid 
 16  import openid.urinorm 
 17   
 18  # Try to import httplib2 for caching support 
 19  # http://bitworking.org/projects/httplib2/ 
 20  try: 
 21      import httplib2 
 22  except ImportError: 
 23      # httplib2 not available 
 24      httplib2 = None 
 25   
 26  # try to import pycurl, which will let us use CurlHTTPFetcher 
 27  try: 
 28      import pycurl 
 29  except ImportError: 
 30      pycurl = None 
 31   
 32  USER_AGENT = "python-openid/%s (%s)" % (openid.__version__, sys.platform) 
 33   
def fetch(url, body=None, headers=None):
    """Fetch C{url} using the library's default fetcher.

    Convenience wrapper; most users should need only this function.

    @raises Exception: any exceptions that may be raised by the default fetcher
    """
    return getDefaultFetcher().fetch(url, body, headers)
42
def createHTTPFetcher():
    """Create a default HTTP fetcher instance.

    Prefers pycurl-based fetching, falling back to urllib2 when
    pycurl is not installed.
    """
    if pycurl is not None:
        return CurlHTTPFetcher()
    return Urllib2Fetcher()
# Contains the currently set HTTP fetcher. If it is set to None, the
# library will call createHTTPFetcher() to set it. Do not access this
# variable outside of this module; use getDefaultFetcher() /
# setDefaultFetcher() instead.
_default_fetcher = None
def getDefaultFetcher():
    """Return the default fetcher, lazily creating one on first use.

    @return: the default fetcher
    @rtype: HTTPFetcher
    """
    global _default_fetcher

    fetcher = _default_fetcher
    if fetcher is None:
        # First use: install a freshly created fetcher as the default.
        setDefaultFetcher(createHTTPFetcher())
        fetcher = _default_fetcher
    return fetcher
72
def setDefaultFetcher(fetcher, wrap_exceptions=True):
    """Install C{fetcher} as the library-wide default HTTP fetcher.

    @param fetcher: The fetcher to use as the default HTTP fetcher
    @type fetcher: HTTPFetcher

    @param wrap_exceptions: Whether to wrap exceptions thrown by the
        fetcher with HTTPFetchingError so that they may be caught more
        easily. By default, exceptions will be wrapped. In general,
        unwrapped fetchers are useful for debugging of fetching errors
        or if your fetcher raises well-known exceptions that you would
        like to catch.
    @type wrap_exceptions: bool
    """
    global _default_fetcher
    if fetcher is not None and wrap_exceptions:
        _default_fetcher = ExceptionWrappingFetcher(fetcher)
    else:
        # Either clearing the default (None) or installing it unwrapped.
        _default_fetcher = fetcher
92
def usingCurl():
    """Whether the currently set HTTP fetcher is a Curl HTTP fetcher."""
    default = getDefaultFetcher()
    return isinstance(default, CurlHTTPFetcher)
96
class HTTPResponse(object):
    """Simple value object describing an HTTP response.

    @ivar final_url: the URL of the response (after any redirects)
    @ivar status: the numeric HTTP status code
    @ivar headers: the response headers, as a dictionary
    @ivar body: the response body
    """
    headers = None
    status = None
    body = None
    final_url = None

    def __init__(self, final_url=None, status=None, headers=None, body=None):
        self.final_url = final_url
        self.status = status
        self.headers = headers
        self.body = body

    def __repr__(self):
        return "<%s status %s for %s>" % (
            self.__class__.__name__, self.status, self.final_url)
114
class HTTPFetcher(object):
    """
    The interface for openid HTTP fetchers.  Only important if you
    need to implement a fetcher of your own.
    """

    def fetch(self, url, body=None, headers=None):
        """
        Perform an HTTP GET — or POST, when C{body} is supplied —
        following any redirects along the way.

        @param headers: HTTP headers to include with the request
        @type headers: {str:str}

        @return: An object representing the server's HTTP response. If
            there are network or protocol errors, an exception will be
            raised. HTTP error responses, like 404 or 500, do not
            cause exceptions.

        @rtype: L{HTTPResponse}

        @raise Exception: Different implementations will raise
            different errors based on the underlying HTTP library.
        """
        # Abstract: subclasses must override.
        raise NotImplementedError
143
144 -def _allowedURL(url):
145 return url.startswith('http://') or url.startswith('https://')
146
class HTTPFetchingError(Exception):
    """Wrapper around any exception raised by the underlying fetcher
    when using the ExceptionWrappingFetcher.

    @ivar why: The exception that caused this exception
    """

    def __init__(self, why=None):
        super(HTTPFetchingError, self).__init__(why)
        self.why = why
156
class ExceptionWrappingFetcher(HTTPFetcher):
    """Delegating fetcher that converts errors raised by the wrapped
    fetcher into L{HTTPFetchingError}.

    @cvar uncaught_exceptions: Exception types that are re-raised
        unwrapped so the user still sees them directly.
    """

    uncaught_exceptions = (SystemExit, KeyboardInterrupt, MemoryError)

    def __init__(self, fetcher):
        self.fetcher = fetcher

    def fetch(self, *args, **kwargs):
        try:
            return self.fetcher.fetch(*args, **kwargs)
        except self.uncaught_exceptions:
            raise
        except:
            exc_info = sys.exc_info()
            wrapped = exc_info[1]
            if wrapped is None:
                # Python 2 string exceptions have no instance; fall
                # back to the "class" slot of exc_info.
                wrapped = exc_info[0]

            raise HTTPFetchingError(why=wrapped)
181
class Urllib2Fetcher(HTTPFetcher):
    """An C{L{HTTPFetcher}} that uses urllib2.
    """
    def fetch(self, url, body=None, headers=None):
        # Refuse non-HTTP(S) URLs up front; urllib2 would otherwise
        # happily fetch other schemes (e.g. file:).
        if not _allowedURL(url):
            raise ValueError('Bad URL scheme: %r' % (url,))

        if headers is None:
            headers = {}

        # Advertise both this library and urllib2 in the User-Agent,
        # unless the caller already supplied one.
        headers.setdefault(
            'User-Agent',
            "%s Python-urllib/%s" % (USER_AGENT, urllib2.__version__,))

        req = urllib2.Request(url, data=body, headers=headers)
        try:
            f = urllib2.urlopen(req)
            try:
                return self._makeResponse(f)
            finally:
                f.close()
        except urllib2.HTTPError, why:
            # urllib2 raises for 4xx/5xx responses, but the
            # HTTPFetcher contract says those are normal responses:
            # convert the HTTPError (which is file-like) instead.
            try:
                return self._makeResponse(why)
            finally:
                why.close()

    def _makeResponse(self, urllib2_response):
        # Translate a urllib2 response object (or HTTPError, which
        # exposes the same file-like interface) into an HTTPResponse.
        resp = HTTPResponse()
        resp.body = urllib2_response.read()
        resp.final_url = urllib2_response.geturl()
        resp.headers = dict(urllib2_response.info().items())

        if hasattr(urllib2_response, 'code'):
            resp.status = urllib2_response.code
        else:
            # No 'code' attribute on the response; assume success.
            # NOTE(review): presumably unreachable for http(s) URLs
            # given the scheme check above -- confirm.
            resp.status = 200

        return resp
221
class HTTPError(HTTPFetchingError):
    """
    Raised by the C{L{CurlHTTPFetcher}} when it encounters an
    exceptional situation fetching a URL.
    """
228 229 # XXX: define what we mean by paranoid, and make sure it is.
class CurlHTTPFetcher(HTTPFetcher):
    """
    An C{L{HTTPFetcher}} that uses pycurl for fetching.
    See U{http://pycurl.sourceforge.net/}.
    """
    # Total wall-clock budget for a fetch, shared across all redirects.
    ALLOWED_TIME = 20 # seconds

    def __init__(self):
        HTTPFetcher.__init__(self)
        if pycurl is None:
            raise RuntimeError('Cannot find pycurl library')

    def _parseHeaders(self, header_file):
        """Parse a raw response-header buffer into a {name: value} dict.

        Header names are lower-cased, since HTTP header names are
        case-insensitive.

        @raise HTTPError: if the header block is malformed
        """
        header_file.seek(0)

        # Remove the status line from the beginning of the input
        unused_http_status_line = header_file.readline()
        lines = [line.strip() for line in header_file]

        # and the blank line from the end
        empty_line = lines.pop()
        if empty_line:
            # Bug fix: this previously interpolated 'line', which is
            # only the leaked comprehension variable (and a NameError
            # on Python 3).  Report the offending last line instead.
            raise HTTPError(
                "No blank line at end of headers: %r" % (empty_line,))

        headers = {}
        for line in lines:
            try:
                name, value = line.split(':', 1)
            except ValueError:
                raise HTTPError(
                    "Malformed HTTP header line in response: %r" % (line,))

            value = value.strip()

            # HTTP headers are case-insensitive
            name = name.lower()
            headers[name] = value

        return headers

    def _checkURL(self, url):
        # XXX: document that this can be overridden to match desired policy
        # XXX: make sure url is well-formed and routeable
        return _allowedURL(url)

    def fetch(self, url, body=None, headers=None):
        """Fetch C{url} within C{ALLOWED_TIME} seconds, following
        3xx redirects manually.  A POST is performed when C{body} is
        given; redirects are always re-requested as GETs.

        @raise HTTPError: for disallowed URLs, redirects without a
            Location header, or when the time budget is exhausted.
        """
        stop = int(time.time()) + self.ALLOWED_TIME
        off = self.ALLOWED_TIME

        if headers is None:
            headers = {}

        headers.setdefault('User-Agent',
                           "%s %s" % (USER_AGENT, pycurl.version,))

        # headers is always a dict at this point, so build the pycurl
        # header list unconditionally (the old 'is not None' guard was
        # dead code).
        header_list = []
        for header_name, header_value in headers.iteritems():
            header_list.append('%s: %s' % (header_name, header_value))

        c = pycurl.Curl()
        try:
            c.setopt(pycurl.NOSIGNAL, 1)

            if header_list:
                c.setopt(pycurl.HTTPHEADER, header_list)

            # Presence of a body indicates that we should do a POST
            if body is not None:
                c.setopt(pycurl.POST, 1)
                c.setopt(pycurl.POSTFIELDS, body)

            while off > 0:
                if not self._checkURL(url):
                    raise HTTPError("Fetching URL not allowed: %r" % (url,))

                data = cStringIO.StringIO()
                response_header_data = cStringIO.StringIO()
                c.setopt(pycurl.WRITEFUNCTION, data.write)
                c.setopt(pycurl.HEADERFUNCTION, response_header_data.write)
                # Shrink the per-request timeout by however much of the
                # budget has already been spent on earlier redirects.
                c.setopt(pycurl.TIMEOUT, off)
                c.setopt(pycurl.URL, openid.urinorm.urinorm(url))

                c.perform()

                response_headers = self._parseHeaders(response_header_data)
                code = c.getinfo(pycurl.RESPONSE_CODE)
                if code in [301, 302, 303, 307]:
                    url = response_headers.get('location')
                    if url is None:
                        raise HTTPError(
                            'Redirect (%s) returned without a location' % code)

                    # Redirects are always GETs
                    c.setopt(pycurl.POST, 0)

                    # There is no way to reset POSTFIELDS to empty and
                    # reuse the connection, but we only use it once.
                else:
                    resp = HTTPResponse()
                    resp.headers = response_headers
                    resp.status = code
                    resp.final_url = url
                    resp.body = data.getvalue()
                    return resp

                off = stop - int(time.time())

            raise HTTPError("Timed out fetching: %r" % (url,))
        finally:
            c.close()
341
class HTTPLib2Fetcher(HTTPFetcher):
    """A fetcher built on C{httplib2}, which adds HTTP caching
    support.

    @see: http://bitworking.org/projects/httplib2/
    """

    def __init__(self, cache=None):
        """@param cache: An object suitable for use as an C{httplib2}
        cache. If a string is passed, it is assumed to be a
        directory name.
        """
        if httplib2 is None:
            raise RuntimeError('Cannot find httplib2 library. '
                               'See http://bitworking.org/projects/httplib2/')

        super(HTTPLib2Fetcher, self).__init__()

        # The httplib2 object that actually performs the HTTP requests.
        self.httplib2 = httplib2.Http(cache)

        # Match the other fetchers: make httplib2 raise on errors
        # rather than folding them into the status code.
        self.httplib2.force_exception_to_status_code = False

    def fetch(self, url, body=None, headers=None):
        """Perform an HTTP request

        @raises Exception: Any exception that can be raised by httplib2

        @see: C{L{HTTPFetcher.fetch}}
        """
        # httplib2 doesn't check to make sure that the URL's scheme is
        # 'http' so we do it here.
        if not (url.startswith('http://') or url.startswith('https://')):
            raise ValueError('URL is not a HTTP URL: %r' % (url,))

        if body:
            method = 'POST'
        else:
            method = 'GET'

        httplib2_response, content = self.httplib2.request(
            url, method, body=body, headers=headers)

        # Translate the httplib2 response to our HTTP response
        # abstraction.  Error responses (e.g. 400) carry no
        # "content-location" header; the final URL rarely matters for
        # errors, but be careful about it anyway.
        try:
            final_url = httplib2_response['content-location']
        except KeyError:
            # We're assuming that no redirects occurred ...
            assert not httplib2_response.previous
            # ... and this should never happen for a successful response
            assert httplib2_response.status != 200
            final_url = url

        return HTTPResponse(
            body=content,
            final_url=final_url,
            headers=dict(httplib2_response.items()),
            status=httplib2_response.status,
        )
409