"""
metadata.models
===============
.. autoclass:: metadata.models.Resource
    :members:
    :private-members: _article _decode
"""
from django.db import models
from goose import Goose
from crawler.models import WebDocument
class Resource(models.Model):
    """ORM class wrapping persistent data of the web resource.

    Contains hooks into the code for resource processing: the raw content
    is looked up on the ``WebDocument`` with the same url and handed to
    Goose, whose result backs :meth:`title`, :meth:`excerpt` and
    :meth:`sr_summary`.
    """

    url = models.CharField(max_length=1024)
    # Stored in the ``hash`` column; the attribute is named ``_hash``
    # (presumably to avoid clashing with the ``hash`` builtin -- confirm).
    _hash = models.CharField(
        db_column='hash', max_length=512,
        blank=True, null=True)
    protocol = models.CharField(max_length=6, null=True, blank=True)
    contenttype = models.CharField(max_length=512, null=True, blank=True)
    host = models.CharField(max_length=512, null=True, blank=True)
    port = models.IntegerField(null=True, blank=True)
    path = models.TextField(null=True, blank=True)
    depth = models.IntegerField(null=True, blank=True)
    lastFetchDateTime = models.DateTimeField(null=True, blank=True)

    def __unicode__(self):
        """Return the resource url as its text representation."""
        return self.url

    def _decode(self):
        """Look up the content of the corresponding WebDocument.document.

        The WebDocument is matched on ``url``. ``QuerySet.get`` raises
        ``WebDocument.DoesNotExist`` when no row matches and
        ``WebDocument.MultipleObjectsReturned`` when several do; callers
        are expected to handle that (``title`` and ``excerpt`` do).
        """
        # TODO: cache this method
        return WebDocument.objects.get(url=self.url).document

    def _article(self):
        """Analyse resource content, return Goose interface.

        A new ``Goose`` instance is created and the extraction is re-run
        on every call; nothing is cached here.
        """
        # TODO: switch method depending on content_type
        # TODO: for pdf, fall back to tesseract if pdf2text yields not much
        # (then use the larger, or maybe a composite)
        extractor = Goose()
        return extractor.extract(raw_html=self._decode())

    def title(self):
        """Attempt to produce a single line description of the resource.

        Returns the ``title`` of the extracted article, or the placeholder
        ``"(no title)"`` when lookup or extraction fails.
        """
        # assumes Goose interface
        try:
            return self._article().title
        except Exception:
            # Deliberate best-effort: any lookup/extraction failure yields
            # the placeholder. ``Exception`` (rather than a bare except)
            # lets KeyboardInterrupt and SystemExit propagate.
            return "(no title)"

    def excerpt(self):
        """Attempt to produce a plain text version of resource content.

        Returns the ``cleaned_text`` of the extracted article, or the
        placeholder ``"(no text)"`` when lookup or extraction fails.
        """
        # TODO: memcache this,
        # would require working evict-on-save (use signals, test it)
        try:
            return self._article().cleaned_text
        except Exception:
            # Deliberate best-effort, see title(); KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return "(no text)"

    def get_absolute_url(self):
        """Return the resource's own url, formatted as a string."""
        return "%s" % self.url

    def sr_summary(self, max_length=300):
        """Search result summary.

        Returns at most the first ``max_length`` characters of
        :meth:`excerpt`; shorter excerpts are returned unchanged.

        This is a rude hack, it doesn't even break on word boundaries.
        There should be much smarter ways of doing this.

        :param max_length: maximum number of characters to keep; expected
            to be a non-negative integer (defaults to 300).
        """
        # Slicing already returns the whole string when it is shorter
        # than max_length, so no explicit length check is needed.
        return self.excerpt()[:max_length]